acryl-datahub 0.14.1.13rc8__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

This version of acryl-datahub has been flagged as a potentially problematic release.
Files changed (139)
  1. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2506 -2456
  2. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +136 -131
  3. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
  6. datahub/cli/cli_utils.py +2 -0
  7. datahub/cli/delete_cli.py +103 -24
  8. datahub/cli/ingest_cli.py +110 -0
  9. datahub/cli/put_cli.py +1 -1
  10. datahub/cli/specific/dataproduct_cli.py +1 -1
  11. datahub/cli/specific/structuredproperties_cli.py +2 -1
  12. datahub/configuration/common.py +3 -3
  13. datahub/configuration/git.py +7 -1
  14. datahub/configuration/kafka_consumer_config.py +31 -1
  15. datahub/emitter/mcp_patch_builder.py +43 -0
  16. datahub/emitter/rest_emitter.py +17 -4
  17. datahub/ingestion/api/incremental_properties_helper.py +69 -0
  18. datahub/ingestion/api/source.py +6 -1
  19. datahub/ingestion/api/source_helpers.py +4 -2
  20. datahub/ingestion/graph/client.py +2 -0
  21. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
  22. datahub/ingestion/run/pipeline.py +6 -5
  23. datahub/ingestion/run/pipeline_config.py +6 -0
  24. datahub/ingestion/sink/datahub_rest.py +15 -4
  25. datahub/ingestion/source/abs/source.py +4 -0
  26. datahub/ingestion/source/aws/aws_common.py +13 -1
  27. datahub/ingestion/source/aws/sagemaker.py +8 -0
  28. datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
  29. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
  30. datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
  31. datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
  32. datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
  33. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  34. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +0 -1
  35. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +0 -21
  36. datahub/ingestion/source/bigquery_v2/profiler.py +0 -6
  37. datahub/ingestion/source/common/subtypes.py +2 -0
  38. datahub/ingestion/source/csv_enricher.py +1 -1
  39. datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
  40. datahub/ingestion/source/datahub/datahub_source.py +8 -1
  41. datahub/ingestion/source/dbt/dbt_common.py +7 -61
  42. datahub/ingestion/source/dremio/dremio_api.py +204 -86
  43. datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
  44. datahub/ingestion/source/dremio/dremio_config.py +5 -0
  45. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
  46. datahub/ingestion/source/dremio/dremio_entities.py +4 -0
  47. datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
  48. datahub/ingestion/source/dremio/dremio_source.py +7 -2
  49. datahub/ingestion/source/elastic_search.py +1 -1
  50. datahub/ingestion/source/feast.py +97 -6
  51. datahub/ingestion/source/gc/datahub_gc.py +46 -35
  52. datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
  53. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
  54. datahub/ingestion/source/ge_data_profiler.py +46 -9
  55. datahub/ingestion/source/ge_profiling_config.py +5 -0
  56. datahub/ingestion/source/iceberg/iceberg.py +12 -5
  57. datahub/ingestion/source/kafka/kafka.py +39 -19
  58. datahub/ingestion/source/kafka/kafka_connect.py +81 -51
  59. datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
  60. datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
  61. datahub/ingestion/source/looker/view_upstream.py +65 -30
  62. datahub/ingestion/source/metadata/business_glossary.py +35 -18
  63. datahub/ingestion/source/mode.py +0 -23
  64. datahub/ingestion/source/neo4j/__init__.py +0 -0
  65. datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
  66. datahub/ingestion/source/powerbi/__init__.py +0 -1
  67. datahub/ingestion/source/powerbi/config.py +3 -3
  68. datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
  69. datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
  70. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
  71. datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
  72. datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
  73. datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
  74. datahub/ingestion/source/powerbi/powerbi.py +12 -6
  75. datahub/ingestion/source/preset.py +1 -0
  76. datahub/ingestion/source/pulsar.py +21 -2
  77. datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
  78. datahub/ingestion/source/redash.py +13 -63
  79. datahub/ingestion/source/redshift/config.py +1 -0
  80. datahub/ingestion/source/redshift/redshift.py +3 -0
  81. datahub/ingestion/source/s3/source.py +2 -3
  82. datahub/ingestion/source/sigma/data_classes.py +1 -0
  83. datahub/ingestion/source/sigma/sigma.py +101 -43
  84. datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
  85. datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
  86. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
  87. datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
  88. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  89. datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
  90. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
  91. datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
  92. datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
  93. datahub/ingestion/source/sql/athena.py +46 -22
  94. datahub/ingestion/source/sql/mssql/source.py +18 -6
  95. datahub/ingestion/source/sql/sql_common.py +34 -21
  96. datahub/ingestion/source/sql/sql_report.py +1 -0
  97. datahub/ingestion/source/sql/sql_types.py +85 -8
  98. datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
  99. datahub/ingestion/source/superset.py +215 -65
  100. datahub/ingestion/source/tableau/tableau.py +237 -76
  101. datahub/ingestion/source/tableau/tableau_common.py +12 -6
  102. datahub/ingestion/source/tableau/tableau_constant.py +2 -0
  103. datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
  104. datahub/ingestion/source/tableau/tableau_validation.py +48 -0
  105. datahub/ingestion/source/unity/proxy_types.py +1 -0
  106. datahub/ingestion/source/unity/source.py +4 -0
  107. datahub/ingestion/source/unity/usage.py +20 -11
  108. datahub/ingestion/transformer/add_dataset_tags.py +1 -1
  109. datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
  110. datahub/integrations/assertion/common.py +1 -1
  111. datahub/lite/duckdb_lite.py +12 -17
  112. datahub/metadata/_schema_classes.py +512 -392
  113. datahub/metadata/_urns/urn_defs.py +1355 -1355
  114. datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
  115. datahub/metadata/schema.avsc +17222 -17499
  116. datahub/metadata/schemas/FormInfo.avsc +4 -0
  117. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
  118. datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
  119. datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
  120. datahub/specific/chart.py +0 -39
  121. datahub/specific/dashboard.py +0 -39
  122. datahub/specific/datajob.py +7 -57
  123. datahub/sql_parsing/schema_resolver.py +23 -0
  124. datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
  125. datahub/sql_parsing/sqlglot_lineage.py +55 -14
  126. datahub/sql_parsing/sqlglot_utils.py +8 -2
  127. datahub/telemetry/telemetry.py +23 -9
  128. datahub/testing/compare_metadata_json.py +1 -1
  129. datahub/testing/doctest.py +12 -0
  130. datahub/utilities/file_backed_collections.py +35 -2
  131. datahub/utilities/partition_executor.py +1 -1
  132. datahub/utilities/urn_encoder.py +2 -1
  133. datahub/utilities/urns/_urn_base.py +1 -1
  134. datahub/utilities/urns/structured_properties_urn.py +1 -1
  135. datahub/utilities/sql_lineage_parser_impl.py +0 -160
  136. datahub/utilities/sql_parser.py +0 -94
  137. datahub/utilities/sql_parser_base.py +0 -21
  138. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
  139. {acryl_datahub-0.14.1.13rc8.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py
@@ -1,6 +1,7 @@
 import concurrent.futures
 import json
 import logging
+import re
 import warnings
 from collections import defaultdict
 from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:

         return self.execute_query(query=jobs_query)

-    def get_source_by_id(self, source_id: str) -> Optional[Dict]:
-        """
-        Fetch source details by ID.
-        """
-        response = self.get(
-            url=f"/source/{source_id}",
-        )
-        return response if response else None
-
-    def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
-        """
-        Get source information for a dataset given its schema and name.
-        """
-        dataset_id = self.get_dataset_id(schema, dataset)
-        if not dataset_id:
-            return None
-
-        catalog_entry = self.get(
-            url=f"/catalog/{dataset_id}",
-        )
-        if not catalog_entry or "path" not in catalog_entry:
-            return None
-
-        source_id = catalog_entry["path"][0]
-        return self.get_source_by_id(source_id)
-
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
         Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
         )
         return None

-    def get_containers_for_location(
-        self, resource_id: str, path: List[str]
-    ) -> List[Dict[str, str]]:
-        containers = []
+    def _check_pattern_match(
+        self,
+        pattern: str,
+        paths: List[str],
+        allow_prefix: bool = True,
+    ) -> bool:
+        """
+        Helper method to check if a pattern matches any of the paths.
+        Handles hierarchical matching where each level is matched independently.
+        Also handles prefix matching for partial paths.
+        """
+        if pattern == ".*":
+            return True

-        def traverse_path(location_id: str, entity_path: List[str]) -> List:
-            nonlocal containers
-            try:
-                response = self.get(url=f"/catalog/{location_id}")
-                if (
-                    response.get("entityType")
-                    == DremioEntityContainerType.FOLDER.value.lower()
-                ):
-                    containers.append(
-                        {
-                            "id": location_id,
-                            "name": entity_path[-1],
-                            "path": entity_path[:-1],
-                            "container_type": DremioEntityContainerType.FOLDER,
-                        }
-                    )
+        # Convert the pattern to regex with proper anchoring
+        regex_pattern = pattern
+        if pattern.startswith("^"):
+            # Already has start anchor
+            regex_pattern = pattern.replace(".", r"\.")  # Escape dots
+            regex_pattern = regex_pattern.replace(
+                r"\.*", ".*"
+            )  # Convert .* to wildcard
+        else:
+            # Add start anchor and handle dots
+            regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
+
+        # Handle end matching
+        if not pattern.endswith(".*"):
+            if pattern.endswith("$"):
+                # Keep explicit end anchor
+                pass
+            elif not allow_prefix:
+                # Add end anchor for exact matching
+                regex_pattern = regex_pattern + "$"
+
+        for path in paths:
+            if re.match(regex_pattern, path, re.IGNORECASE):
+                return True

-                for container in response.get("children", []):
-                    if (
-                        container.get("type")
-                        == DremioEntityContainerType.CONTAINER.value
-                    ):
-                        traverse_path(container.get("id"), container.get("path"))
+        return False

-            except Exception as exc:
-                logging.info(
-                    "Location {} contains no tables or views. Skipping...".format(id)
-                )
-                self.report.warning(
-                    message="Failed to get tables or views",
-                    context=f"{id}",
-                    exc=exc,
-                )
+    def should_include_container(self, path: List[str], name: str) -> bool:
+        """
+        Helper method to check if a container should be included based on schema patterns.
+        Used by both get_all_containers and get_containers_for_location.
+        """
+        path_components = path + [name] if path else [name]
+        full_path = ".".join(path_components)

-        return containers
+        # Default allow everything case
+        if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
+            self.report.report_container_scanned(full_path)
+            return True

-        return traverse_path(location_id=resource_id, entity_path=path)
+        # Check deny patterns first
+        if self.deny_schema_pattern:
+            for pattern in self.deny_schema_pattern:
+                if self._check_pattern_match(
+                    pattern=pattern,
+                    paths=[full_path],
+                    allow_prefix=False,
+                ):
+                    self.report.report_container_filtered(full_path)
+                    return False
+
+        # Check allow patterns
+        for pattern in self.allow_schema_pattern:
+            # For patterns with wildcards, check if this path is a parent of the pattern
+            if "*" in pattern:
+                pattern_parts = pattern.split(".")
+                path_parts = path_components
+
+                # If pattern has exact same number of parts, check each component
+                if len(pattern_parts) == len(path_parts):
+                    matches = True
+                    for p_part, c_part in zip(pattern_parts, path_parts):
+                        if p_part != "*" and p_part.lower() != c_part.lower():
+                            matches = False
+                            break
+                    if matches:
+                        self.report.report_container_scanned(full_path)
+                        return True
+                # Otherwise check if current path is prefix match
+                else:
+                    # Remove the trailing wildcard if present
+                    if pattern_parts[-1] == "*":
+                        pattern_parts = pattern_parts[:-1]
+
+                    for i in range(len(path_parts)):
+                        current_path = ".".join(path_parts[: i + 1])
+                        pattern_prefix = ".".join(pattern_parts[: i + 1])
+
+                        if pattern_prefix.startswith(current_path):
+                            self.report.report_container_scanned(full_path)
+                            return True
+
+            # Direct pattern matching
+            if self._check_pattern_match(
+                pattern=pattern,
+                paths=[full_path],
+                allow_prefix=True,
+            ):
+                self.report.report_container_scanned(full_path)
+                return True
+
+        self.report.report_container_filtered(full_path)
+        return False

     def get_all_containers(self):
         """
-        Query the Dremio sources API and return source information.
+        Query the Dremio sources API and return filtered source information.
         """
         containers = []
-
         response = self.get(url="/catalog")

         def process_source(source):
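
Aside: the schema filtering added above rests on `_check_pattern_match` turning DataHub's dotted schema patterns into anchored regexes. A standalone sketch of that conversion, mirroring the hunk above (illustrative only, not part of the package):

import re
from typing import List


def check_pattern_match(pattern: str, paths: List[str], allow_prefix: bool = True) -> bool:
    # ".*" matches everything, mirroring the helper's short-circuit.
    if pattern == ".*":
        return True
    # Escape literal dots, then restore the escaped ".*" as a regex wildcard.
    regex = pattern.replace(".", r"\.").replace(r"\.*", ".*")
    if not pattern.startswith("^"):
        regex = "^" + regex
    # Deny-style (exact) matching anchors the end unless the pattern already does.
    if not pattern.endswith(".*") and not pattern.endswith("$") and not allow_prefix:
        regex += "$"
    return any(re.match(regex, p, re.IGNORECASE) for p in paths)


# Allow patterns tolerate prefixes; deny patterns must match the full path.
assert check_pattern_match("analytics.*", ["analytics.sales.orders"])
assert not check_pattern_match("analytics", ["analytics.sales"], allow_prefix=False)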
@@ -731,34 +770,41 @@ class DremioAPIOperations:
                 )

                 source_config = source_resp.get("config", {})
-                if source_config.get("database"):
-                    db = source_config.get("database")
-                else:
-                    db = source_config.get("databaseName", "")
-
-                return {
-                    "id": source.get("id"),
-                    "name": source.get("path")[0],
-                    "path": [],
-                    "container_type": DremioEntityContainerType.SOURCE,
-                    "source_type": source_resp.get("type"),
-                    "root_path": source_config.get("rootPath"),
-                    "database_name": db,
-                }
+                db = source_config.get(
+                    "database", source_config.get("databaseName", "")
+                )
+
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SOURCE,
+                        "source_type": source_resp.get("type"),
+                        "root_path": source_config.get("rootPath"),
+                        "database_name": db,
+                    }
             else:
-                return {
-                    "id": source.get("id"),
-                    "name": source.get("path")[0],
-                    "path": [],
-                    "container_type": DremioEntityContainerType.SPACE,
-                }
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SPACE,
+                    }
+            return None

         def process_source_and_containers(source):
             container = process_source(source)
+            if not container:
+                return []
+
+            # Get sub-containers
             sub_containers = self.get_containers_for_location(
                 resource_id=container.get("id"),
                 path=[container.get("name")],
             )
+
             return [container] + sub_containers

         # Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,6 +817,78 @@ class DremioAPIOperations:
         }

         for future in concurrent.futures.as_completed(future_to_source):
-            containers.extend(future.result())
+            source = future_to_source[future]
+            try:
+                containers.extend(future.result())
+            except Exception as exc:
+                logger.error(f"Error processing source: {exc}")
+                self.report.warning(
+                    message="Failed to process source",
+                    context=f"{source}",
+                    exc=exc,
+                )

         return containers
+
+    def get_context_for_vds(self, resource_id: str) -> str:
+        context_array = self.get(
+            url=f"/catalog/{resource_id}",
+        ).get("sqlContext")
+        if context_array:
+            return ".".join(
+                f'"{part}"' if "." in part else f"{part}" for part in context_array
+            )
+        else:
+            return ""
+
+    def get_containers_for_location(
+        self, resource_id: str, path: List[str]
+    ) -> List[Dict[str, str]]:
+        containers = []
+
+        def traverse_path(location_id: str, entity_path: List[str]) -> List:
+            nonlocal containers
+            try:
+                response = self.get(url=f"/catalog/{location_id}")
+
+                # Check if current folder should be included
+                if (
+                    response.get("entityType")
+                    == DremioEntityContainerType.FOLDER.value.lower()
+                ):
+                    folder_name = entity_path[-1]
+                    folder_path = entity_path[:-1]
+
+                    if self.should_include_container(folder_path, folder_name):
+                        containers.append(
+                            {
+                                "id": location_id,
+                                "name": folder_name,
+                                "path": folder_path,
+                                "container_type": DremioEntityContainerType.FOLDER,
+                            }
+                        )
+
+                # Recursively process child containers
+                for container in response.get("children", []):
+                    if (
+                        container.get("type")
+                        == DremioEntityContainerType.CONTAINER.value
+                    ):
+                        traverse_path(container.get("id"), container.get("path"))
+
+            except Exception as exc:
+                logging.info(
+                    "Location {} contains no tables or views. Skipping...".format(
+                        location_id
+                    )
+                )
+                self.report.warning(
+                    message="Failed to get tables or views",
+                    context=f"{location_id}",
+                    exc=exc,
+                )
+
+            return containers
+
+        return traverse_path(location_id=resource_id, entity_path=path)
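
`get_context_for_vds` rebuilds a view's default schema from its `sqlContext`, quoting only the path parts that themselves contain a dot. A quick illustration with hypothetical values:

context_array = ["my-space", "folder.v2", "reports"]
default_schema = ".".join(
    f'"{part}"' if "." in part else f"{part}" for part in context_array
)
print(default_schema)  # my-space."folder.v2".reports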
datahub/ingestion/source/dremio/dremio_aspects.py
@@ -142,6 +142,7 @@ class DremioAspects:
         platform: str,
         ui_url: str,
         env: str,
+        ingest_owner: bool,
         domain: Optional[str] = None,
         platform_instance: Optional[str] = None,
     ):
@@ -150,6 +151,7 @@
         self.env = env
         self.domain = domain
         self.ui_url = ui_url
+        self.ingest_owner = ingest_owner

     def get_container_key(
         self, name: Optional[str], path: Optional[List[str]]
@@ -426,21 +428,23 @@ class DremioAspects:
         return f'{self.ui_url}/{container_type}/{dataset_url_path}"{dataset.resource_name}"'

     def _create_ownership(self, dataset: DremioDataset) -> Optional[OwnershipClass]:
-        if not dataset.owner:
-            return None
-        owner = (
-            make_user_urn(dataset.owner)
-            if dataset.owner_type == "USER"
-            else make_group_urn(dataset.owner)
-        )
-        return OwnershipClass(
-            owners=[
-                OwnerClass(
-                    owner=owner,
-                    type=OwnershipTypeClass.TECHNICAL_OWNER,
-                )
-            ]
-        )
+        if self.ingest_owner and dataset.owner:
+            owner_urn = (
+                make_user_urn(dataset.owner)
+                if dataset.owner_type == "USER"
+                else make_group_urn(dataset.owner)
+            )
+            ownership: OwnershipClass = OwnershipClass(
+                owners=[
+                    OwnerClass(
+                        owner=owner_urn,
+                        type=OwnershipTypeClass.TECHNICAL_OWNER,
+                    )
+                ]
+            )
+            return ownership
+
+        return None

     def _create_glossary_terms(self, entity: DremioDataset) -> GlossaryTermsClass:
         return GlossaryTermsClass(
datahub/ingestion/source/dremio/dremio_config.py
@@ -174,3 +174,8 @@ class DremioSourceConfig(
         default=False,
         description="Whether to include query-based lineage information.",
     )
+
+    ingest_owner: bool = Field(
+        default=True,
+        description="Ingest Owner from source. This will override Owner info entered from UI",
+    )
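
Since `ingest_owner` defaults to True, preserving owners entered through the DataHub UI requires setting it to False explicitly. A sketch of a programmatic recipe; the connection values are placeholders, not taken from this diff:

from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "dremio",
            "config": {
                "hostname": "dremio.example.com",  # placeholder connection details
                "ingest_owner": False,  # keep Owner info entered from the UI
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder
        },
    }
)
pipeline.run()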
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py
@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE": "snowflake",
         "SYNAPSE": "mssql",
         "TERADATA": "teradata",
+        "VERTICA": "vertica",
     }

     DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE",
         "SYNAPSE",
         "TERADATA",
+        "VERTICA",
     }

     FILE_OBJECT_STORAGE_TYPES = {
datahub/ingestion/source/dremio/dremio_entities.py
@@ -200,6 +200,7 @@ class DremioDataset:
     columns: List[DremioDatasetColumn]
     sql_definition: Optional[str]
     dataset_type: DremioDatasetType
+    default_schema: Optional[str]
     owner: Optional[str]
     owner_type: Optional[str]
     created: str
@@ -235,6 +236,9 @@ class DremioDataset:

         if self.sql_definition:
             self.dataset_type = DremioDatasetType.VIEW
+            self.default_schema = api_operations.get_context_for_vds(
+                resource_id=self.resource_id
+            )
         else:
             self.dataset_type = DremioDatasetType.TABLE

datahub/ingestion/source/dremio/dremio_reporting.py
@@ -14,12 +14,27 @@ class DremioSourceReport(
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
+    containers_scanned: int = 0
+    containers_filtered: int = 0

     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
         pass

+    def report_container_scanned(self, name: str) -> None:
+        """
+        Record that a container was successfully scanned
+        """
+        self.containers_scanned += 1
+
+    def report_container_filtered(self, container_name: str) -> None:
+        """
+        Record that a container was filtered out
+        """
+        self.containers_filtered += 1
+        self.report_dropped(container_name)
+
     def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
         """
         Entity could be a view or a table
datahub/ingestion/source/dremio/dremio_source.py
@@ -97,6 +97,7 @@ class DremioSource(StatefulIngestionSourceBase):
     - Ownership and Glossary Terms:
         - Metadata related to ownership of datasets, extracted from Dremio’s ownership model.
         - Glossary terms and business metadata associated with datasets, providing additional context to the data.
+        - Note: Ownership information will only be available for the Cloud and Enterprise editions, it will not be available for the Community edition.

     - Optional SQL Profiling (if enabled):
         - Table, row, and column statistics can be profiled and ingested via optional SQL queries.
@@ -123,6 +124,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.dremio_aspects = DremioAspects(
             platform=self.get_platform(),
             domain=self.config.domain,
+            ingest_owner=self.config.ingest_owner,
             platform_instance=self.config.platform_instance,
             env=self.config.env,
             ui_url=dremio_api.ui_url,
@@ -394,10 +396,12 @@ class DremioSource(StatefulIngestionSourceBase):
             ):
                 yield dremio_mcp
                 # Check if the emitted aspect is SchemaMetadataClass
-                if isinstance(dremio_mcp.metadata, SchemaMetadataClass):
+                if isinstance(
+                    dremio_mcp.metadata, MetadataChangeProposalWrapper
+                ) and isinstance(dremio_mcp.metadata.aspect, SchemaMetadataClass):
                     self.sql_parsing_aggregator.register_schema(
                         urn=dataset_urn,
-                        schema=dremio_mcp.metadata,
+                        schema=dremio_mcp.metadata.aspect,
                     )

             if dataset_info.dataset_type == DremioDatasetType.VIEW:
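
The old check could never fire for MCP-style work units: their `.metadata` is a `MetadataChangeProposalWrapper`, and the `SchemaMetadataClass` aspect sits one level down. A minimal demonstration of the corrected check (urn and field values are placeholders):

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import OtherSchemaClass, SchemaMetadataClass

schema = SchemaMetadataClass(
    schemaName="example",  # placeholder values throughout
    platform="urn:li:dataPlatform:dremio",
    version=0,
    hash="",
    platformSchema=OtherSchemaClass(rawSchema=""),
    fields=[],
)
mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:dremio,space.view,PROD)",
    aspect=schema,
)

assert not isinstance(mcp, SchemaMetadataClass)  # the old comparison: always False
assert isinstance(mcp.aspect, SchemaMetadataClass)  # the aspect is nested inside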
@@ -415,6 +419,7 @@ class DremioSource(StatefulIngestionSourceBase):
                     view_urn=dataset_urn,
                     view_definition=dataset_info.sql_definition,
                     default_db=self.default_db,
+                    default_schema=dataset_info.default_schema,
                 )

             elif dataset_info.dataset_type == DremioDatasetType.TABLE:
datahub/ingestion/source/elastic_search.py
@@ -227,7 +227,7 @@ def collapse_name(name: str, collapse_urns: CollapseUrns) -> str:
 def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     if len(collapse_urns.urns_suffix_regex) == 0:
         return urn
-    urn_obj = DatasetUrn.create_from_string(urn)
+    urn_obj = DatasetUrn.from_string(urn)
     name = collapse_name(name=urn_obj.get_dataset_name(), collapse_urns=collapse_urns)
     data_platform_urn = urn_obj.get_data_platform_urn()
     return str(
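
`create_from_string` is the older spelling; `from_string` is its replacement on the URN classes and behaves the same. For example:

from datahub.utilities.urns.dataset_urn import DatasetUrn

urn_obj = DatasetUrn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:elasticsearch,my_index-000001,PROD)"
)
print(urn_obj.get_dataset_name())       # my_index-000001
print(urn_obj.get_data_platform_urn())  # urn:li:dataPlatform:elasticsearch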
datahub/ingestion/source/feast.py
@@ -42,10 +42,14 @@ from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     BrowsePathsClass,
+    GlobalTagsClass,
     MLFeaturePropertiesClass,
     MLFeatureTablePropertiesClass,
     MLPrimaryKeyPropertiesClass,
+    OwnerClass,
+    OwnershipClass,
     StatusClass,
+    TagAssociationClass,
 )

 # FIXME: ValueType module cannot be used as a type
@@ -91,6 +95,24 @@ class FeastRepositorySourceConfig(ConfigModel):
     environment: str = Field(
         default=DEFAULT_ENV, description="Environment to use when constructing URNs"
     )
+    # owner_mappings example:
+    # This must be added to the recipe in order to extract owners, otherwise NO owners will be extracted
+    # owner_mappings:
+    #   - feast_owner_name: "<owner>"
+    #     datahub_owner_urn: "urn:li:corpGroup:<owner>"
+    #     datahub_ownership_type: "BUSINESS_OWNER"
+    owner_mappings: Optional[List[Dict[str, str]]] = Field(
+        default=None, description="Mapping of owner names to owner types"
+    )
+    enable_owner_extraction: bool = Field(
+        default=False,
+        description="If this is disabled, then we NEVER try to map owners. "
+        "If this is enabled, then owner_mappings is REQUIRED to extract ownership.",
+    )
+    enable_tag_extraction: bool = Field(
+        default=False,
+        description="If this is disabled, then we NEVER try to extract tags.",
+    )


 @platform_name("Feast")
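
Putting the three new options together, a Feast source config that extracts both tags and owners might look like this; the repo path and owner names are placeholders:

from datahub.ingestion.source.feast import FeastRepositorySourceConfig

config = FeastRepositorySourceConfig.parse_obj(
    {
        "path": "/path/to/feature_repo",  # placeholder
        "enable_tag_extraction": True,
        "enable_owner_extraction": True,
        "owner_mappings": [
            {
                "feast_owner_name": "data-platform-team",  # placeholder owner
                "datahub_owner_urn": "urn:li:corpGroup:data-platform-team",
                "datahub_ownership_type": "BUSINESS_OWNER",
            }
        ],
    }
)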
@@ -215,10 +237,15 @@ class FeastRepositorySource(Source):
         """

         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = (
+            [StatusClass(removed=False)]
+            + self._get_tags(entity)
+            + self._get_owners(entity)
+        )

         entity_snapshot = MLPrimaryKeySnapshot(
             urn=builder.make_ml_primary_key_urn(feature_view_name, entity.name),
-            aspects=[StatusClass(removed=False)],
+            aspects=aspects,
         )

         entity_snapshot.aspects.append(
@@ -243,10 +270,11 @@ class FeastRepositorySource(Source):
         Generate an MLFeature work unit for a Feast feature.
         """
         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = [StatusClass(removed=False)] + self._get_tags(field)

         feature_snapshot = MLFeatureSnapshot(
             urn=builder.make_ml_feature_urn(feature_view_name, field.name),
-            aspects=[StatusClass(removed=False)],
+            aspects=aspects,
         )

         feature_sources = []
@@ -295,13 +323,18 @@ class FeastRepositorySource(Source):
         """

         feature_view_name = f"{self.feature_store.project}.{feature_view.name}"
+        aspects = (
+            [
+                BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]),
+                StatusClass(removed=False),
+            ]
+            + self._get_tags(feature_view)
+            + self._get_owners(feature_view)
+        )

         feature_view_snapshot = MLFeatureTableSnapshot(
             urn=builder.make_ml_feature_table_urn("feast", feature_view_name),
-            aspects=[
-                BrowsePathsClass(paths=[f"/feast/{self.feature_store.project}"]),
-                StatusClass(removed=False),
-            ],
+            aspects=aspects,
         )

         feature_view_snapshot.aspects.append(
@@ -360,6 +393,64 @@ class FeastRepositorySource(Source):

         return MetadataWorkUnit(id=on_demand_feature_view_name, mce=mce)

+    # If a tag is specified in a Feast object, then the tag will be ingested into Datahub if enable_tag_extraction is
+    # True, otherwise NO tags will be ingested
+    def _get_tags(self, obj: Union[Entity, FeatureView, FeastField]) -> list:
+        """
+        Extracts tags from the given object and returns a list of aspects.
+        """
+        aspects: List[Union[GlobalTagsClass]] = []
+
+        # Extract tags
+        if self.source_config.enable_tag_extraction:
+            if obj.tags.get("name"):
+                tag_name: str = obj.tags["name"]
+                tag_association = TagAssociationClass(
+                    tag=builder.make_tag_urn(tag_name)
+                )
+                global_tags_aspect = GlobalTagsClass(tags=[tag_association])
+                aspects.append(global_tags_aspect)
+
+        return aspects
+
+    # If an owner is specified in a Feast object, it will only be ingested into Datahub if owner_mappings is specified
+    # and enable_owner_extraction is True in FeastRepositorySourceConfig, otherwise NO owners will be ingested
+    def _get_owners(self, obj: Union[Entity, FeatureView, FeastField]) -> list:
+        """
+        Extracts owners from the given object and returns a list of aspects.
+        """
+        aspects: List[Union[OwnershipClass]] = []
+
+        # Extract owner
+        if self.source_config.enable_owner_extraction:
+            owner = getattr(obj, "owner", None)
+            if owner:
+                # Create owner association, skipping if None
+                owner_association = self._create_owner_association(owner)
+                if owner_association:  # Only add valid owner associations
+                    owners_aspect = OwnershipClass(owners=[owner_association])
+                    aspects.append(owners_aspect)
+
+        return aspects
+
+    def _create_owner_association(self, owner: str) -> Optional[OwnerClass]:
+        """
+        Create an OwnerClass instance for the given owner using the owner mappings.
+        """
+        if self.source_config.owner_mappings is not None:
+            for mapping in self.source_config.owner_mappings:
+                if mapping["feast_owner_name"] == owner:
+                    ownership_type_class: str = mapping.get(
+                        "datahub_ownership_type", "TECHNICAL_OWNER"
+                    )
+                    datahub_owner_urn = mapping.get("datahub_owner_urn")
+                    if datahub_owner_urn:
+                        return OwnerClass(
+                            owner=datahub_owner_urn,
+                            type=ownership_type_class,
+                        )
+        return None
+
     @classmethod
     def create(cls, config_dict, ctx):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
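
Net effect of `_get_owners` plus `_create_owner_association`: an owner is emitted only when extraction is enabled and a mapping entry matches, and a matching entry without `datahub_owner_urn` is dropped. A standalone trace of the lookup with hypothetical values:

owner_mappings = [
    {
        "feast_owner_name": "data-platform-team",
        "datahub_owner_urn": "urn:li:corpGroup:data-platform-team",
        # datahub_ownership_type omitted -> falls back to TECHNICAL_OWNER
    }
]


def lookup(owner):
    for mapping in owner_mappings:
        if mapping["feast_owner_name"] == owner:
            ownership_type = mapping.get("datahub_ownership_type", "TECHNICAL_OWNER")
            urn = mapping.get("datahub_owner_urn")
            if urn:
                return (urn, ownership_type)
    return None  # unmapped owners are skipped entirely


print(lookup("data-platform-team"))  # ('urn:li:corpGroup:data-platform-team', 'TECHNICAL_OWNER')
print(lookup("ghost"))  # None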