acryl-datahub 0.15.0rc19__py3-none-any.whl → 0.15.0rc21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/METADATA +2334 -2334
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/RECORD +20 -20
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +56 -68
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/sink/datahub_rest.py +12 -1
- datahub/ingestion/source/dremio/dremio_api.py +193 -86
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +16 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +23 -0
- datahub/metadata/_schema_classes.py +400 -400
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/schema.avsc +17221 -17574
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc19.dist-info → acryl_datahub-0.15.0rc21.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_api.py

@@ -1,6 +1,7 @@
 import concurrent.futures
 import json
 import logging
+import re
 import warnings
 from collections import defaultdict
 from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:

         return self.execute_query(query=jobs_query)

-    def get_source_by_id(self, source_id: str) -> Optional[Dict]:
-        """
-        Fetch source details by ID.
-        """
-        response = self.get(
-            url=f"/source/{source_id}",
-        )
-        return response if response else None
-
-    def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
-        """
-        Get source information for a dataset given its schema and name.
-        """
-        dataset_id = self.get_dataset_id(schema, dataset)
-        if not dataset_id:
-            return None
-
-        catalog_entry = self.get(
-            url=f"/catalog/{dataset_id}",
-        )
-        if not catalog_entry or "path" not in catalog_entry:
-            return None
-
-        source_id = catalog_entry["path"][0]
-        return self.get_source_by_id(source_id)
-
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
         Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
         )
         return None

-    def
-        self,
-
-
+    def _check_pattern_match(
+        self,
+        pattern: str,
+        paths: List[str],
+        allow_prefix: bool = True,
+    ) -> bool:
+        """
+        Helper method to check if a pattern matches any of the paths.
+        Handles hierarchical matching where each level is matched independently.
+        Also handles prefix matching for partial paths.
+        """
+        if pattern == ".*":
+            return True

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Convert the pattern to regex with proper anchoring
+        regex_pattern = pattern
+        if pattern.startswith("^"):
+            # Already has start anchor
+            regex_pattern = pattern.replace(".", r"\.")  # Escape dots
+            regex_pattern = regex_pattern.replace(
+                r"\.*", ".*"
+            )  # Convert .* to wildcard
+        else:
+            # Add start anchor and handle dots
+            regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
+
+        # Handle end matching
+        if not pattern.endswith(".*"):
+            if pattern.endswith("$"):
+                # Keep explicit end anchor
+                pass
+            elif not allow_prefix:
+                # Add end anchor for exact matching
+                regex_pattern = regex_pattern + "$"
+
+        for path in paths:
+            if re.match(regex_pattern, path, re.IGNORECASE):
+                return True

-
-                    if (
-                        container.get("type")
-                        == DremioEntityContainerType.CONTAINER.value
-                    ):
-                        traverse_path(container.get("id"), container.get("path"))
+        return False

-
-
-
-
-
-
-
-                    exc=exc,
-                )
+    def should_include_container(self, path: List[str], name: str) -> bool:
+        """
+        Helper method to check if a container should be included based on schema patterns.
+        Used by both get_all_containers and get_containers_for_location.
+        """
+        path_components = path + [name] if path else [name]
+        full_path = ".".join(path_components)

-
+        # Default allow everything case
+        if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
+            self.report.report_container_scanned(full_path)
+            return True

-
+        # Check deny patterns first
+        if self.deny_schema_pattern:
+            for pattern in self.deny_schema_pattern:
+                if self._check_pattern_match(
+                    pattern=pattern,
+                    paths=[full_path],
+                    allow_prefix=False,
+                ):
+                    self.report.report_container_filtered(full_path)
+                    return False
+
+        # Check allow patterns
+        for pattern in self.allow_schema_pattern:
+            # For patterns with wildcards, check if this path is a parent of the pattern
+            if "*" in pattern:
+                pattern_parts = pattern.split(".")
+                path_parts = path_components
+
+                # If pattern has exact same number of parts, check each component
+                if len(pattern_parts) == len(path_parts):
+                    matches = True
+                    for p_part, c_part in zip(pattern_parts, path_parts):
+                        if p_part != "*" and p_part.lower() != c_part.lower():
+                            matches = False
+                            break
+                    if matches:
+                        self.report.report_container_scanned(full_path)
+                        return True
+                # Otherwise check if current path is prefix match
+                else:
+                    # Remove the trailing wildcard if present
+                    if pattern_parts[-1] == "*":
+                        pattern_parts = pattern_parts[:-1]
+
+                    for i in range(len(path_parts)):
+                        current_path = ".".join(path_parts[: i + 1])
+                        pattern_prefix = ".".join(pattern_parts[: i + 1])
+
+                        if pattern_prefix.startswith(current_path):
+                            self.report.report_container_scanned(full_path)
+                            return True
+
+            # Direct pattern matching
+            if self._check_pattern_match(
+                pattern=pattern,
+                paths=[full_path],
+                allow_prefix=True,
+            ):
+                self.report.report_container_scanned(full_path)
+                return True
+
+        self.report.report_container_filtered(full_path)
+        return False

     def get_all_containers(self):
         """
-        Query the Dremio sources API and return source information.
+        Query the Dremio sources API and return filtered source information.
         """
         containers = []
-
         response = self.get(url="/catalog")

         def process_source(source):
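
The two methods added above drive schema filtering for Dremio containers: allow/deny patterns are converted into anchored, case-insensitive regexes and matched against dotted container paths, with deny patterns requiring a full match and allow patterns also accepting prefixes. A minimal standalone sketch of that conversion (the helper name and sample paths are illustrative, not part of the source):

import re

def schema_pattern_to_regex(pattern: str, allow_prefix: bool = True) -> str:
    # Escape literal dots, restore ".*" wildcards, anchor the start, and only
    # anchor the end when exact (non-prefix) matching is requested.
    regex = pattern.replace(".", r"\.").replace(r"\.*", ".*")
    if not pattern.startswith("^"):
        regex = "^" + regex
    if not pattern.endswith(".*") and not pattern.endswith("$") and not allow_prefix:
        regex += "$"
    return regex

# Allow pattern: anything under mysource.sales matches, case-insensitively.
assert re.match(
    schema_pattern_to_regex("mysource.sales.*"), "MySource.Sales.Orders", re.IGNORECASE
)
# Deny pattern (allow_prefix=False): the full dotted path must match exactly.
assert not re.match(
    schema_pattern_to_regex("mysource.sales", allow_prefix=False),
    "mysource.sales.orders",
    re.IGNORECASE,
)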
@@ -731,34 +770,41 @@ class DremioAPIOperations:
                 )

                 source_config = source_resp.get("config", {})
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                db = source_config.get(
+                    "database", source_config.get("databaseName", "")
+                )
+
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SOURCE,
+                        "source_type": source_resp.get("type"),
+                        "root_path": source_config.get("rootPath"),
+                        "database_name": db,
+                    }
             else:
-
-
-
-
-
-
+                if self.should_include_container([], source.get("path")[0]):
+                    return {
+                        "id": source.get("id"),
+                        "name": source.get("path")[0],
+                        "path": [],
+                        "container_type": DremioEntityContainerType.SPACE,
+                    }
+            return None

         def process_source_and_containers(source):
             container = process_source(source)
+            if not container:
+                return []
+
+            # Get sub-containers
             sub_containers = self.get_containers_for_location(
                 resource_id=container.get("id"),
                 path=[container.get("name")],
             )
+
             return [container] + sub_containers

         # Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,7 +817,16 @@ class DremioAPIOperations:
             }

             for future in concurrent.futures.as_completed(future_to_source):
-
+                source = future_to_source[future]
+                try:
+                    containers.extend(future.result())
+                except Exception as exc:
+                    logger.error(f"Error processing source: {exc}")
+                    self.report.warning(
+                        message="Failed to process source",
+                        context=f"{source}",
+                        exc=exc,
+                    )

         return containers

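
The loop above now resolves each future individually, so one failing Dremio source is logged and reported without aborting the remaining sources. The same stdlib pattern in isolation (the fetch function and source names are made up for illustration):

import concurrent.futures
import logging

logger = logging.getLogger(__name__)

def fetch(source: str) -> list:
    # Stand-in for process_source_and_containers; fails for one input.
    if source == "broken":
        raise RuntimeError("boom")
    return [f"{source}-container"]

containers: list = []
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_source = {executor.submit(fetch, s): s for s in ["s1", "broken", "s2"]}
    for future in concurrent.futures.as_completed(future_to_source):
        source = future_to_source[future]
        try:
            containers.extend(future.result())
        except Exception as exc:
            # The failure is reported per source instead of propagating.
            logger.error(f"Error processing source {source}: {exc}")

print(sorted(containers))  # ['s1-container', 's2-container']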
@@ -785,3 +840,55 @@ class DremioAPIOperations:
             )
         else:
             return ""
+
+    def get_containers_for_location(
+        self, resource_id: str, path: List[str]
+    ) -> List[Dict[str, str]]:
+        containers = []
+
+        def traverse_path(location_id: str, entity_path: List[str]) -> List:
+            nonlocal containers
+            try:
+                response = self.get(url=f"/catalog/{location_id}")
+
+                # Check if current folder should be included
+                if (
+                    response.get("entityType")
+                    == DremioEntityContainerType.FOLDER.value.lower()
+                ):
+                    folder_name = entity_path[-1]
+                    folder_path = entity_path[:-1]
+
+                    if self.should_include_container(folder_path, folder_name):
+                        containers.append(
+                            {
+                                "id": location_id,
+                                "name": folder_name,
+                                "path": folder_path,
+                                "container_type": DremioEntityContainerType.FOLDER,
+                            }
+                        )
+
+                # Recursively process child containers
+                for container in response.get("children", []):
+                    if (
+                        container.get("type")
+                        == DremioEntityContainerType.CONTAINER.value
+                    ):
+                        traverse_path(container.get("id"), container.get("path"))
+
+            except Exception as exc:
+                logging.info(
+                    "Location {} contains no tables or views. Skipping...".format(
+                        location_id
+                    )
+                )
+                self.report.warning(
+                    message="Failed to get tables or views",
+                    context=f"{location_id}",
+                    exc=exc,
+                )
+
+            return containers
+
+        return traverse_path(location_id=resource_id, entity_path=path)
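
get_containers_for_location builds its result through a nested traverse_path closure that recurses into child containers and appends to a shared list via nonlocal. A self-contained sketch of that closure pattern over a toy in-memory catalog (the data and names are hypothetical, not Dremio API responses):

from typing import Dict, List

# Toy catalog keyed by id; "children" reference other ids (hypothetical data).
CATALOG: Dict[str, Dict] = {
    "root": {"type": "CONTAINER", "path": ["root"], "children": ["a", "b"]},
    "a": {"type": "CONTAINER", "path": ["root", "a"], "children": []},
    "b": {"type": "DATASET", "path": ["root", "b"], "children": []},
}

def collect_folders(start_id: str) -> List[List[str]]:
    folders: List[List[str]] = []

    def traverse(entity_id: str) -> List[List[str]]:
        nonlocal folders
        entry = CATALOG[entity_id]
        if entry["type"] == "CONTAINER":
            folders.append(entry["path"])
            for child_id in entry["children"]:
                traverse(child_id)
        return folders

    return traverse(start_id)

print(collect_folders("root"))  # [['root'], ['root', 'a']]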
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py

@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE": "snowflake",
         "SYNAPSE": "mssql",
         "TERADATA": "teradata",
+        "VERTICA": "vertica",
     }

     DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE",
         "SYNAPSE",
         "TERADATA",
+        "VERTICA",
     }

     FILE_OBJECT_STORAGE_TYPES = {
datahub/ingestion/source/dremio/dremio_reporting.py

@@ -14,12 +14,27 @@ class DremioSourceReport(
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
+    containers_scanned: int = 0
+    containers_filtered: int = 0

     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
         pass

+    def report_container_scanned(self, name: str) -> None:
+        """
+        Record that a container was successfully scanned
+        """
+        self.containers_scanned += 1
+
+    def report_container_filtered(self, container_name: str) -> None:
+        """
+        Record that a container was filtered out
+        """
+        self.containers_filtered += 1
+        self.report_dropped(container_name)
+
     def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
         """
         Entity could be a view or a table
datahub/ingestion/source/kafka/kafka_connect.py

@@ -282,10 +282,6 @@ class ConfluentJDBCSourceConnector:
         query: str
         transforms: list

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -355,9 +351,9 @@ class ConfluentJDBCSourceConnector:
                     source_table = f"{table_name_tuple[-2]}.{source_table}"
                 else:
                     include_source_dataset = False
-                    self.
-
-                    f"
+                    self.report.warning(
+                        "Could not find schema for table"
+                        f"{self.connector_manifest.name} : {source_table}",
                     )
             dataset_name: str = get_dataset_name(database_name, source_table)
             lineage = KafkaConnectLineage(
@@ -457,9 +453,9 @@ class ConfluentJDBCSourceConnector:
                     target_platform=KAFKA,
                 )
                 lineages.append(lineage)
-            self.
+            self.report.warning(
+                "Could not find input dataset, the connector has query configuration set",
                 self.connector_manifest.name,
-                "could not find input dataset, the connector has query configuration set",
             )
             self.connector_manifest.lineages = lineages
             return
@@ -535,24 +531,24 @@ class ConfluentJDBCSourceConnector:
                             include_source_dataset=False,
                         )
                     )
-                self.
-
-                f"
+                self.report.warning(
+                    "Could not find input dataset for connector topics",
+                    f"{self.connector_manifest.name} : {topic_names}",
                 )
                 self.connector_manifest.lineages = lineages
                 return
             else:
                 include_source_dataset = True
                 if SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                    self.
-
-                    f"
+                    self.report.warning(
+                        "Could not find input dataset, connector has unknown transform",
+                        f"{self.connector_manifest.name} : {transforms[0]['type']}",
                     )
                     include_source_dataset = False
                 if not SINGLE_TRANSFORM and UNKNOWN_TRANSFORM:
-                    self.
+                    self.report.warning(
+                        "Could not find input dataset, connector has one or more unknown transforms",
                         self.connector_manifest.name,
-                        "could not find input dataset, connector has one or more unknown transforms",
                     )
                     include_source_dataset = False
                 lineages = self.default_get_lineages(
@@ -753,8 +749,10 @@ class DebeziumSourceConnector:
                 lineages.append(lineage)
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.
-
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )

         return
@@ -783,10 +781,6 @@ class BigQuerySinkConnector:
         defaultDataset: Optional[str] = None
         version: str = "v1"

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_parser(
         self,
         connector_manifest: ConnectorManifest,
@@ -917,9 +911,9 @@ class BigQuerySinkConnector:
             transformed_topic = self.apply_transformations(topic, transforms)
             dataset_table = self.get_dataset_table_for_topic(transformed_topic, parser)
             if dataset_table is None:
-                self.
-
-                f"
+                self.report.warning(
+                    "Could not find target dataset for topic, please check your connector configuration"
+                    f"{self.connector_manifest.name} : {transformed_topic} ",
                 )
                 continue
             target_dataset = f"{project}.{dataset_table}"
@@ -954,10 +948,6 @@ class SnowflakeSinkConnector:
         schema_name: str
         topics_to_tables: Dict[str, str]

-    def report_warning(self, key: str, reason: str) -> None:
-        logger.warning(f"{key}: {reason}")
-        self.report.report_warning(key, reason)
-
     def get_table_name_from_topic_name(self, topic_name: str) -> str:
         """
         This function converts the topic name to a valid Snowflake table name using some rules.
@@ -1105,8 +1095,10 @@ class ConfluentS3SinkConnector:
                 )
             self.connector_manifest.lineages = lineages
         except Exception as e:
-            self.report.
-
+            self.report.warning(
+                "Error resolving lineage for connector",
+                self.connector_manifest.name,
+                exc=e,
             )

         return
@@ -1155,7 +1147,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             )
             self.session.auth = (self.config.username, self.config.password)

-        test_response = self.session.get(f"{self.config.connect_uri}")
+        test_response = self.session.get(f"{self.config.connect_uri}/connectors")
         test_response.raise_for_status()
         logger.info(f"Connection to {self.config.connect_uri} is ok")
         if not jpype.isJVMStarted():
@@ -1178,13 +1170,16 @@ class KafkaConnectSource(StatefulIngestionSourceBase):

         payload = connector_response.json()

-        for
-            connector_url = f"{self.config.connect_uri}/connectors/{
-
-
-
-            if
-
+        for connector_name in payload:
+            connector_url = f"{self.config.connect_uri}/connectors/{connector_name}"
+            connector_manifest = self._get_connector_manifest(
+                connector_name, connector_url
+            )
+            if (
+                connector_manifest is None
+                or not self.config.connector_patterns.allowed(connector_manifest.name)
+            ):
+                self.report.report_dropped(connector_name)
                 continue

             if self.config.provided_configs:
@@ -1195,19 +1190,11 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
             connector_manifest.lineages = list()
             connector_manifest.url = connector_url

-
-                f"{self.config.connect_uri}/connectors/{c}/topics",
-            ).json()
-
-            connector_manifest.topic_names = topics[c]["topics"]
+            connector_manifest.topic_names = self._get_connector_topics(connector_name)

             # Populate Source Connector metadata
             if connector_manifest.type == SOURCE:
-                tasks = self.
-                    f"{self.config.connect_uri}/connectors/{c}/tasks",
-                ).json()
-
-                connector_manifest.tasks = tasks
+                connector_manifest.tasks = self._get_connector_tasks(connector_name)

             # JDBC source connector lineages
             if connector_manifest.config.get(CONNECTOR_CLASS).__eq__(
@@ -1246,7 +1233,7 @@ class KafkaConnectSource(StatefulIngestionSourceBase):
                 )
                 continue

-            for topic in
+            for topic in connector_manifest.topic_names:
                 lineage = KafkaConnectLineage(
                     source_dataset=target_connector.source_dataset,
                     source_platform=target_connector.source_platform,
@@ -1286,6 +1273,49 @@ class KafkaConnectSource(StatefulIngestionSourceBase):

         return connectors_manifest

+    def _get_connector_manifest(
+        self, connector_name: str, connector_url: str
+    ) -> Optional[ConnectorManifest]:
+        try:
+            connector_response = self.session.get(connector_url)
+            connector_response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Failed to get connector details", connector_name, exc=e
+            )
+            return None
+        manifest = connector_response.json()
+        connector_manifest = ConnectorManifest(**manifest)
+        return connector_manifest
+
+    def _get_connector_tasks(self, connector_name: str) -> dict:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/tasks",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector tasks", context=connector_name, exc=e
+            )
+            return {}
+
+        return response.json()
+
+    def _get_connector_topics(self, connector_name: str) -> List[str]:
+        try:
+            response = self.session.get(
+                f"{self.config.connect_uri}/connectors/{connector_name}/topics",
+            )
+            response.raise_for_status()
+        except Exception as e:
+            self.report.warning(
+                "Error getting connector topics", context=connector_name, exc=e
+            )
+            return []
+
+        return response.json()[connector_name]["topics"]
+
     def construct_flow_workunit(self, connector: ConnectorManifest) -> MetadataWorkUnit:
         connector_name = connector.name
         connector_type = connector.type
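
The new private helpers wrap the Kafka Connect REST endpoints and turn request failures into per-connector warnings instead of failing the whole ingestion run. A rough standalone sketch of the same endpoints using requests directly (the base URL and helper names here are placeholders, not the source implementation):

from typing import List

import requests

CONNECT_URI = "http://localhost:8083"  # placeholder

session = requests.Session()

def list_connectors() -> List[str]:
    # GET /connectors returns the connector names; the connectivity test above
    # now hits this endpoint instead of the bare base URL.
    resp = session.get(f"{CONNECT_URI}/connectors")
    resp.raise_for_status()
    return resp.json()

def get_connector_topics(connector_name: str) -> List[str]:
    # GET /connectors/{name}/topics returns {"<name>": {"topics": [...]}}.
    try:
        resp = session.get(f"{CONNECT_URI}/connectors/{connector_name}/topics")
        resp.raise_for_status()
    except Exception:
        # Mirror the new behaviour: skip this connector rather than abort the run.
        return []
    return resp.json()[connector_name]["topics"]

for name in list_connectors():
    print(name, get_connector_topics(name))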
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

@@ -413,9 +413,10 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
             return UpstreamLineageEdge.parse_obj(db_row)
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
+            upstream_tables = db_row.get("UPSTREAM_TABLES")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-                context=db_row.get(
+                context=f"Upstreams: {upstream_tables} Downstreams: {db_row.get('DOWNSTREAM_TABLE_NAME')}",
                 exc=e,
             )
             return None
datahub/ingestion/source/snowflake/snowflake_query.py

@@ -237,6 +237,19 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """

+    @staticmethod
+    def get_secure_view_definitions() -> str:
+        # https://docs.snowflake.com/en/sql-reference/account-usage/views
+        return """
+        SELECT
+            TABLE_CATALOG as "TABLE_CATALOG",
+            TABLE_SCHEMA as "TABLE_SCHEMA",
+            TABLE_NAME as "TABLE_NAME",
+            VIEW_DEFINITION as "VIEW_DEFINITION"
+        FROM SNOWFLAKE.ACCOUNT_USAGE.VIEWS
+        WHERE IS_SECURE = 'YES' AND VIEW_DEFINITION !='' AND DELETED IS NULL
+        """
+
     @staticmethod
     def columns_for_schema(
         schema_name: str,
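
The new query pulls secure-view definitions from SNOWFLAKE.ACCOUNT_USAGE.VIEWS (ACCOUNT_USAGE views require appropriate privileges and are subject to latency). A rough sketch of consuming it over a snowflake-connector-python cursor; the connection handling and helper name are illustrative, not part of the source:

from typing import Dict, Tuple

def fetch_secure_view_definitions(conn, secure_view_sql: str) -> Dict[Tuple[str, str, str], str]:
    # secure_view_sql would be the query string returned by get_secure_view_definitions() above.
    definitions: Dict[Tuple[str, str, str], str] = {}
    with conn.cursor() as cursor:
        cursor.execute(secure_view_sql)
        # Columns come back in the order selected above:
        # TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, VIEW_DEFINITION.
        for catalog, schema, name, definition in cursor:
            definitions[(catalog, schema, name)] = definition
    return definitions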