acryl-datahub 0.15.0rc18__py3-none-any.whl → 0.15.0rc20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic; see the registry's release advisory for more details.

@@ -1,4 +1,4 @@
1
- datahub/__init__.py,sha256=IT3uFaiJFLl5DUkIwM1qTDLAORKdv0y2E2XTiSL4TyM,575
1
+ datahub/__init__.py,sha256=fYgu28dsndrekGv9Pq_ENw7G6Erm7qtsY5H6W3cKFDU,575
2
2
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
3
3
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
4
4
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -282,13 +282,13 @@ datahub/ingestion/source/delta_lake/delta_lake_utils.py,sha256=VqIDPEXepOnlk4oWM
282
282
  datahub/ingestion/source/delta_lake/report.py,sha256=uqWWivPltlZ7dwpOOluTvHOKKsSusqihn67clCAwxoM,467
283
283
  datahub/ingestion/source/delta_lake/source.py,sha256=jLCN6SeAv3bCD4w4ZDw15eIbFF3yVWcxVtBklovFEBg,13548
284
284
  datahub/ingestion/source/dremio/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
285
- datahub/ingestion/source/dremio/dremio_api.py,sha256=R7HLqAg845SdX4zWhl2Tm8AtxaFpUIX_zxRRvap2uCQ,28998
285
+ datahub/ingestion/source/dremio/dremio_api.py,sha256=am8o_mQq7zteI4zasnkRb9B9-_BFrchTIA_oJkqRagA,33470
286
286
  datahub/ingestion/source/dremio/dremio_aspects.py,sha256=3VeHzCw9q1ytngmsq_K4Ll9tWD2V8EDFySBImHdhPAw,18287
287
287
  datahub/ingestion/source/dremio/dremio_config.py,sha256=5SP66ewGYN0OnyWgpU33EZOmtICsclTtBX5DSYLwl3c,5782
288
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py,sha256=YkYC3-TB-Jn65z2GN_NMErQDovwU7krQ9b92DBh4uvY,3021
288
+ datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py,sha256=-Fefw59tXR6QA8ifOz_mieDccMMG_vyQgp7j-BaXFHQ,3070
289
289
  datahub/ingestion/source/dremio/dremio_entities.py,sha256=3H3vIvj5ab4d8gmB9-rbZfwRgW87gT1DdjWiMjNgqJ4,15069
290
290
  datahub/ingestion/source/dremio/dremio_profiling.py,sha256=TAcnpo8ZRKhLDHnQSJzJg3YdwTSyEa73LUAzENs7wG4,12287
291
- datahub/ingestion/source/dremio/dremio_reporting.py,sha256=IPgv7lOnhK6mQeqwRsPscKnXhzgVZG8Id3yNcsmG7nw,1273
291
+ datahub/ingestion/source/dremio/dremio_reporting.py,sha256=pYyEOAxiotyVySumY85Ql8vtGsne7B9sDrdqeVFnWLQ,1742
292
292
  datahub/ingestion/source/dremio/dremio_source.py,sha256=NJxDXWd19A3MPplPiLPAjxTmjeJBA04PcPytRSslmYQ,26323
293
293
  datahub/ingestion/source/dremio/dremio_sql_queries.py,sha256=W0rcXawlwJOHNYr5o73rilMijtFOO3cVkn6pY-JLc6o,8186
294
294
  datahub/ingestion/source/dynamodb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -438,7 +438,7 @@ datahub/ingestion/source/snowflake/snowflake_shares.py,sha256=ud3Ah4qHrmSfpD8Od-
438
438
  datahub/ingestion/source/snowflake/snowflake_summary.py,sha256=kTmuCtRnvHqM8WBYhWeK4XafJq3ssFL9kcS03jEeWT4,5506
439
439
  datahub/ingestion/source/snowflake/snowflake_tag.py,sha256=fyfWmFVz2WZrpTJWNIe9m0WpDHgeFrGPf8diORJZUwo,6212
440
440
  datahub/ingestion/source/snowflake/snowflake_usage_v2.py,sha256=PEmYNMXJRUvLQmVd8juVqjokfuSPuH9ppcM0ruXamxA,24807
441
- datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=443P7t839_iRymWMIg-dd7to21smsazS110UKEYbpEU,12588
441
+ datahub/ingestion/source/snowflake/snowflake_utils.py,sha256=YczNEupY89jeegjR2_1pT4bPi9wQ69EIhGpzyCe9Jdg,12600
442
442
  datahub/ingestion/source/snowflake/snowflake_v2.py,sha256=lo_3asTuIZbF-LuEUcYL-9NIZ720n7oB9mYA6WVTWA4,31960
443
443
  datahub/ingestion/source/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
444
444
  datahub/ingestion/source/sql/athena.py,sha256=G3cIY8H_76lIUAzQWW2kLnZOEsfbakmojxbiHb3dYZ8,24059
@@ -560,7 +560,7 @@ datahub/lite/lite_server.py,sha256=p9Oa2nNs65mqcssSIVOr7VOzWqfVstz6ZQEdT4f82S0,1
560
560
  datahub/lite/lite_util.py,sha256=pgBpT3vTO1YCQ2njZRNyicSkHYeEmQCt41BaXU8WvMo,4503
561
561
  datahub/metadata/__init__.py,sha256=AjhXPjI6cnpdcrBRrE5gOWo15vv2TTl2ctU4UAnUN7A,238
562
562
  datahub/metadata/_schema_classes.py,sha256=iPeBXGvbNEm0vw5pYwunnvx7bTtBdmIQVtzMOlS6bSI,955042
563
- datahub/metadata/schema.avsc,sha256=wMMSgx3OtzD1tNTC4dh_PFBZrnco21i2jO5J7oy2PgE,677545
563
+ datahub/metadata/schema.avsc,sha256=Xx93OdPzQfBb2CtntIYE-HAeKNg-JZcCtRU95v7ZZCs,677728
564
564
  datahub/metadata/schema_classes.py,sha256=X5Jl5EaSxyHdXOQv14pJ5WkQALun4MRpJ4q12wVFE18,1299
565
565
  datahub/metadata/urns.py,sha256=nfrCTExR-k2P9w272WVtWSN3xW1VUJngPwP3xnvULjU,1217
566
566
  datahub/metadata/_urns/__init__.py,sha256=cOF3GHMDgPhmbLKbN02NPpuLGHSu0qNgQyBRv08eqF0,243
@@ -757,7 +757,7 @@ datahub/metadata/schemas/ExecutionRequestKey.avsc,sha256=SvjnlTAGYsSnvVE0rZ9-7UP
757
757
  datahub/metadata/schemas/ExecutionRequestResult.avsc,sha256=kg3xMNr9kYLPnFsV-iqcGm1sh1muQVGJvxUt15L1yKo,2333
758
758
  datahub/metadata/schemas/ExecutionRequestSignal.avsc,sha256=dsIUa6tfVSXqYOgh4cW6_Hzi8RjHuJJoO-mBAuZukpA,2515
759
759
  datahub/metadata/schemas/Filter.avsc,sha256=PU-aGkc2-sI3ZXY7ci-Y0A7zp1jux3VW_6c8MJRAokg,5933
760
- datahub/metadata/schemas/FormInfo.avsc,sha256=Gol1Qh6eB5lYvAOXBx5k2eTtyolGI8n0o_Cv_pxX40E,6095
760
+ datahub/metadata/schemas/FormInfo.avsc,sha256=FbN34htiCgm3LqKDL3sVsJhMUHIyc5jYpGJtYm7Ysd4,6270
761
761
  datahub/metadata/schemas/FormKey.avsc,sha256=1-wE28B8T3WJ3JtexreNtFvP3To3n7U-jvYudCuSM9o,437
762
762
  datahub/metadata/schemas/Forms.avsc,sha256=shmkhRoHN2gTaTsqGrGDRoNwe_z-nrFbbLjH9MtVDCs,10955
763
763
  datahub/metadata/schemas/GlobalSettingsInfo.avsc,sha256=OVMM6FwhHhufHkezYcVePK0zI2llzFYLVFJhmAiHoiI,10102
@@ -869,11 +869,11 @@ datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn
869
869
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
870
870
  datahub/sql_parsing/schema_resolver.py,sha256=9INZWdxA2dMSLK6RXaVqjbjyLY_VKMhCkQv_Xd6Ln3I,10848
871
871
  datahub/sql_parsing/split_statements.py,sha256=uZhAXLaRxDfmK0lPBW2oM_YVdJfSMhdgndnfd9iIXuA,5001
872
- datahub/sql_parsing/sql_parsing_aggregator.py,sha256=gLelf5l73EufB8qijb9ZDLANkt4o05schGg4DY-bOJs,69937
872
+ datahub/sql_parsing/sql_parsing_aggregator.py,sha256=F-aj7yqOwbo7FpxduFO5a7cLWkojL_Npv3_dlfHPNGY,69877
873
873
  datahub/sql_parsing/sql_parsing_common.py,sha256=h_V_m54hJ9EUh5kczq7cYOIeNeo4bgf0Px0H-Nq-UIg,2602
874
874
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
875
875
  datahub/sql_parsing/sqlglot_lineage.py,sha256=CLDOc0HNqL_539eahOP3QOoldIYC6CF29id4Xe3TlEM,47018
876
- datahub/sql_parsing/sqlglot_utils.py,sha256=8MYzkyekhup3ihVStRPuwneWPNu17xhBg5SG8iVfFRY,14431
876
+ datahub/sql_parsing/sqlglot_utils.py,sha256=n6yufzEGwSlFeCSU540hEldIuab0q8KGqm9x0vSawkc,14699
877
877
  datahub/sql_parsing/tool_meta_extractor.py,sha256=pE-pkRKBfNTXEJkaQM9NlG807mc-X6OtetgskJySCs8,2908
878
878
  datahub/telemetry/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
879
879
  datahub/telemetry/stats.py,sha256=YltbtC3fe6rl1kcxn1A-mSnVpECTPm5k-brrUt7QxTI,967
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
974
974
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
975
975
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
976
976
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
977
- acryl_datahub-0.15.0rc18.dist-info/METADATA,sha256=qrcInf1ZskftVhU7CPnCFSK96c0Klv1F-pLn70wszLI,173559
978
- acryl_datahub-0.15.0rc18.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
979
- acryl_datahub-0.15.0rc18.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
980
- acryl_datahub-0.15.0rc18.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
981
- acryl_datahub-0.15.0rc18.dist-info/RECORD,,
977
+ acryl_datahub-0.15.0rc20.dist-info/METADATA,sha256=KuTZA5lnEW-UAvSPqqkBsDFKkwlJF8WzYbcphVMW_aE,173559
978
+ acryl_datahub-0.15.0rc20.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
979
+ acryl_datahub-0.15.0rc20.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
980
+ acryl_datahub-0.15.0rc20.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
981
+ acryl_datahub-0.15.0rc20.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
3
3
 
4
4
  # Published at https://pypi.org/project/acryl-datahub/.
5
5
  __package_name__ = "acryl-datahub"
6
- __version__ = "0.15.0rc18"
6
+ __version__ = "0.15.0rc20"
7
7
 
8
8
 
9
9
  def is_dev_mode() -> bool:
@@ -1,6 +1,7 @@
1
1
  import concurrent.futures
2
2
  import json
3
3
  import logging
4
+ import re
4
5
  import warnings
5
6
  from collections import defaultdict
6
7
  from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:
609
610
 
610
611
  return self.execute_query(query=jobs_query)
611
612
 
612
- def get_source_by_id(self, source_id: str) -> Optional[Dict]:
613
- """
614
- Fetch source details by ID.
615
- """
616
- response = self.get(
617
- url=f"/source/{source_id}",
618
- )
619
- return response if response else None
620
-
621
- def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
622
- """
623
- Get source information for a dataset given its schema and name.
624
- """
625
- dataset_id = self.get_dataset_id(schema, dataset)
626
- if not dataset_id:
627
- return None
628
-
629
- catalog_entry = self.get(
630
- url=f"/catalog/{dataset_id}",
631
- )
632
- if not catalog_entry or "path" not in catalog_entry:
633
- return None
634
-
635
- source_id = catalog_entry["path"][0]
636
- return self.get_source_by_id(source_id)
637
-
638
613
  def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
639
614
  """
640
615
  Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
673
648
  )
674
649
  return None
675
650
 
676
- def get_containers_for_location(
677
- self, resource_id: str, path: List[str]
678
- ) -> List[Dict[str, str]]:
679
- containers = []
651
+ def _check_pattern_match(
652
+ self,
653
+ pattern: str,
654
+ paths: List[str],
655
+ allow_prefix: bool = True,
656
+ ) -> bool:
657
+ """
658
+ Helper method to check if a pattern matches any of the paths.
659
+ Handles hierarchical matching where each level is matched independently.
660
+ Also handles prefix matching for partial paths.
661
+ """
662
+ if pattern == ".*":
663
+ return True
680
664
 
681
- def traverse_path(location_id: str, entity_path: List[str]) -> List:
682
- nonlocal containers
683
- try:
684
- response = self.get(url=f"/catalog/{location_id}")
685
- if (
686
- response.get("entityType")
687
- == DremioEntityContainerType.FOLDER.value.lower()
688
- ):
689
- containers.append(
690
- {
691
- "id": location_id,
692
- "name": entity_path[-1],
693
- "path": entity_path[:-1],
694
- "container_type": DremioEntityContainerType.FOLDER,
695
- }
696
- )
665
+ # Convert the pattern to regex with proper anchoring
666
+ regex_pattern = pattern
667
+ if pattern.startswith("^"):
668
+ # Already has start anchor
669
+ regex_pattern = pattern.replace(".", r"\.") # Escape dots
670
+ regex_pattern = regex_pattern.replace(
671
+ r"\.*", ".*"
672
+ ) # Convert .* to wildcard
673
+ else:
674
+ # Add start anchor and handle dots
675
+ regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
676
+
677
+ # Handle end matching
678
+ if not pattern.endswith(".*"):
679
+ if pattern.endswith("$"):
680
+ # Keep explicit end anchor
681
+ pass
682
+ elif not allow_prefix:
683
+ # Add end anchor for exact matching
684
+ regex_pattern = regex_pattern + "$"
685
+
686
+ for path in paths:
687
+ if re.match(regex_pattern, path, re.IGNORECASE):
688
+ return True
697
689
 
698
- for container in response.get("children", []):
699
- if (
700
- container.get("type")
701
- == DremioEntityContainerType.CONTAINER.value
702
- ):
703
- traverse_path(container.get("id"), container.get("path"))
690
+ return False
704
691
 
705
- except Exception as exc:
706
- logging.info(
707
- "Location {} contains no tables or views. Skipping...".format(id)
708
- )
709
- self.report.warning(
710
- message="Failed to get tables or views",
711
- context=f"{id}",
712
- exc=exc,
713
- )
692
+ def should_include_container(self, path: List[str], name: str) -> bool:
693
+ """
694
+ Helper method to check if a container should be included based on schema patterns.
695
+ Used by both get_all_containers and get_containers_for_location.
696
+ """
697
+ path_components = path + [name] if path else [name]
698
+ full_path = ".".join(path_components)
714
699
 
715
- return containers
700
+ # Default allow everything case
701
+ if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
702
+ self.report.report_container_scanned(full_path)
703
+ return True
716
704
 
717
- return traverse_path(location_id=resource_id, entity_path=path)
705
+ # Check deny patterns first
706
+ if self.deny_schema_pattern:
707
+ for pattern in self.deny_schema_pattern:
708
+ if self._check_pattern_match(
709
+ pattern=pattern,
710
+ paths=[full_path],
711
+ allow_prefix=False,
712
+ ):
713
+ self.report.report_container_filtered(full_path)
714
+ return False
715
+
716
+ # Check allow patterns
717
+ for pattern in self.allow_schema_pattern:
718
+ # For patterns with wildcards, check if this path is a parent of the pattern
719
+ if "*" in pattern:
720
+ pattern_parts = pattern.split(".")
721
+ path_parts = path_components
722
+
723
+ # If pattern has exact same number of parts, check each component
724
+ if len(pattern_parts) == len(path_parts):
725
+ matches = True
726
+ for p_part, c_part in zip(pattern_parts, path_parts):
727
+ if p_part != "*" and p_part.lower() != c_part.lower():
728
+ matches = False
729
+ break
730
+ if matches:
731
+ self.report.report_container_scanned(full_path)
732
+ return True
733
+ # Otherwise check if current path is prefix match
734
+ else:
735
+ # Remove the trailing wildcard if present
736
+ if pattern_parts[-1] == "*":
737
+ pattern_parts = pattern_parts[:-1]
738
+
739
+ for i in range(len(path_parts)):
740
+ current_path = ".".join(path_parts[: i + 1])
741
+ pattern_prefix = ".".join(pattern_parts[: i + 1])
742
+
743
+ if pattern_prefix.startswith(current_path):
744
+ self.report.report_container_scanned(full_path)
745
+ return True
746
+
747
+ # Direct pattern matching
748
+ if self._check_pattern_match(
749
+ pattern=pattern,
750
+ paths=[full_path],
751
+ allow_prefix=True,
752
+ ):
753
+ self.report.report_container_scanned(full_path)
754
+ return True
755
+
756
+ self.report.report_container_filtered(full_path)
757
+ return False
718
758
 
719
759
  def get_all_containers(self):
720
760
  """
721
- Query the Dremio sources API and return source information.
761
+ Query the Dremio sources API and return filtered source information.
722
762
  """
723
763
  containers = []
724
-
725
764
  response = self.get(url="/catalog")
726
765
 
727
766
  def process_source(source):
@@ -731,34 +770,41 @@ class DremioAPIOperations:
731
770
  )
732
771
 
733
772
  source_config = source_resp.get("config", {})
734
- if source_config.get("database"):
735
- db = source_config.get("database")
736
- else:
737
- db = source_config.get("databaseName", "")
738
-
739
- return {
740
- "id": source.get("id"),
741
- "name": source.get("path")[0],
742
- "path": [],
743
- "container_type": DremioEntityContainerType.SOURCE,
744
- "source_type": source_resp.get("type"),
745
- "root_path": source_config.get("rootPath"),
746
- "database_name": db,
747
- }
773
+ db = source_config.get(
774
+ "database", source_config.get("databaseName", "")
775
+ )
776
+
777
+ if self.should_include_container([], source.get("path")[0]):
778
+ return {
779
+ "id": source.get("id"),
780
+ "name": source.get("path")[0],
781
+ "path": [],
782
+ "container_type": DremioEntityContainerType.SOURCE,
783
+ "source_type": source_resp.get("type"),
784
+ "root_path": source_config.get("rootPath"),
785
+ "database_name": db,
786
+ }
748
787
  else:
749
- return {
750
- "id": source.get("id"),
751
- "name": source.get("path")[0],
752
- "path": [],
753
- "container_type": DremioEntityContainerType.SPACE,
754
- }
788
+ if self.should_include_container([], source.get("path")[0]):
789
+ return {
790
+ "id": source.get("id"),
791
+ "name": source.get("path")[0],
792
+ "path": [],
793
+ "container_type": DremioEntityContainerType.SPACE,
794
+ }
795
+ return None
755
796
 
756
797
  def process_source_and_containers(source):
757
798
  container = process_source(source)
799
+ if not container:
800
+ return []
801
+
802
+ # Get sub-containers
758
803
  sub_containers = self.get_containers_for_location(
759
804
  resource_id=container.get("id"),
760
805
  path=[container.get("name")],
761
806
  )
807
+
762
808
  return [container] + sub_containers
763
809
 
764
810
  # Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,7 +817,16 @@ class DremioAPIOperations:
771
817
  }
772
818
 
773
819
  for future in concurrent.futures.as_completed(future_to_source):
774
- containers.extend(future.result())
820
+ source = future_to_source[future]
821
+ try:
822
+ containers.extend(future.result())
823
+ except Exception as exc:
824
+ logger.error(f"Error processing source: {exc}")
825
+ self.report.warning(
826
+ message="Failed to process source",
827
+ context=f"{source}",
828
+ exc=exc,
829
+ )
775
830
 
776
831
  return containers
777
832
 
@@ -785,3 +840,55 @@ class DremioAPIOperations:
785
840
  )
786
841
  else:
787
842
  return ""
843
+
844
+ def get_containers_for_location(
845
+ self, resource_id: str, path: List[str]
846
+ ) -> List[Dict[str, str]]:
847
+ containers = []
848
+
849
+ def traverse_path(location_id: str, entity_path: List[str]) -> List:
850
+ nonlocal containers
851
+ try:
852
+ response = self.get(url=f"/catalog/{location_id}")
853
+
854
+ # Check if current folder should be included
855
+ if (
856
+ response.get("entityType")
857
+ == DremioEntityContainerType.FOLDER.value.lower()
858
+ ):
859
+ folder_name = entity_path[-1]
860
+ folder_path = entity_path[:-1]
861
+
862
+ if self.should_include_container(folder_path, folder_name):
863
+ containers.append(
864
+ {
865
+ "id": location_id,
866
+ "name": folder_name,
867
+ "path": folder_path,
868
+ "container_type": DremioEntityContainerType.FOLDER,
869
+ }
870
+ )
871
+
872
+ # Recursively process child containers
873
+ for container in response.get("children", []):
874
+ if (
875
+ container.get("type")
876
+ == DremioEntityContainerType.CONTAINER.value
877
+ ):
878
+ traverse_path(container.get("id"), container.get("path"))
879
+
880
+ except Exception as exc:
881
+ logging.info(
882
+ "Location {} contains no tables or views. Skipping...".format(
883
+ location_id
884
+ )
885
+ )
886
+ self.report.warning(
887
+ message="Failed to get tables or views",
888
+ context=f"{location_id}",
889
+ exc=exc,
890
+ )
891
+
892
+ return containers
893
+
894
+ return traverse_path(location_id=resource_id, entity_path=path)
@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
31
31
  "SNOWFLAKE": "snowflake",
32
32
  "SYNAPSE": "mssql",
33
33
  "TERADATA": "teradata",
34
+ "VERTICA": "vertica",
34
35
  }
35
36
 
36
37
  DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
52
53
  "SNOWFLAKE",
53
54
  "SYNAPSE",
54
55
  "TERADATA",
56
+ "VERTICA",
55
57
  }
56
58
 
57
59
  FILE_OBJECT_STORAGE_TYPES = {
@@ -14,12 +14,27 @@ class DremioSourceReport(
14
14
  ):
15
15
  num_containers_failed: int = 0
16
16
  num_datasets_failed: int = 0
17
+ containers_scanned: int = 0
18
+ containers_filtered: int = 0
17
19
 
18
20
  def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
19
21
  # recording total combined latency is not very useful, keeping this method as a placeholder
20
22
  # for future implementation of min / max / percentiles etc.
21
23
  pass
22
24
 
25
+ def report_container_scanned(self, name: str) -> None:
26
+ """
27
+ Record that a container was successfully scanned
28
+ """
29
+ self.containers_scanned += 1
30
+
31
+ def report_container_filtered(self, container_name: str) -> None:
32
+ """
33
+ Record that a container was filtered out
34
+ """
35
+ self.containers_filtered += 1
36
+ self.report_dropped(container_name)
37
+
23
38
  def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
24
39
  """
25
40
  Entity could be a view or a table
@@ -119,7 +119,6 @@ class SnowflakeFilter:
119
119
  ) -> bool:
120
120
  if not dataset_type or not dataset_name:
121
121
  return True
122
- dataset_params = dataset_name.split(".")
123
122
  if dataset_type.lower() not in (
124
123
  SnowflakeObjectDomain.TABLE,
125
124
  SnowflakeObjectDomain.EXTERNAL_TABLE,
@@ -131,6 +130,7 @@ class SnowflakeFilter:
131
130
  if _is_sys_table(dataset_name):
132
131
  return False
133
132
 
133
+ dataset_params = _split_qualified_name(dataset_name)
134
134
  if len(dataset_params) != 3:
135
135
  self.structured_reporter.info(
136
136
  title="Unexpected dataset pattern",
@@ -18518,6 +18518,10 @@
18518
18518
  "namespace": "com.linkedin.pegasus2avro.form",
18519
18519
  "fields": [
18520
18520
  {
18521
+ "Searchable": {
18522
+ "fieldName": "structuredPropertyPromptUrns",
18523
+ "fieldType": "URN"
18524
+ },
18521
18525
  "java": {
18522
18526
  "class": "com.linkedin.pegasus2avro.common.urn.Urn"
18523
18527
  },
@@ -97,6 +97,10 @@
97
97
  "namespace": "com.linkedin.pegasus2avro.form",
98
98
  "fields": [
99
99
  {
100
+ "Searchable": {
101
+ "fieldName": "structuredPropertyPromptUrns",
102
+ "fieldType": "URN"
103
+ },
100
104
  "java": {
101
105
  "class": "com.linkedin.pegasus2avro.common.urn.Urn"
102
106
  },
@@ -1383,8 +1383,7 @@ class SqlParsingAggregator(Closeable):
1383
1383
  return QueryUrn(query_id).urn()
1384
1384
 
1385
1385
  @classmethod
1386
- def _composite_query_id(cls, composed_of_queries: Iterable[QueryId]) -> str:
1387
- composed_of_queries = list(composed_of_queries)
1386
+ def _composite_query_id(cls, composed_of_queries: List[QueryId]) -> str:
1388
1387
  combined = json.dumps(composed_of_queries)
1389
1388
  return f"composite_{generate_hash(combined)}"
1390
1389
 
@@ -121,7 +121,7 @@ _BASIC_NORMALIZATION_RULES = {
121
121
  # Remove /* */ comments.
122
122
  re.compile(r"/\*.*?\*/", re.DOTALL): "",
123
123
  # Remove -- comments.
124
- re.compile(r"--.*$"): "",
124
+ re.compile(r"--.*$", re.MULTILINE): "",
125
125
  # Replace all runs of whitespace with a single space.
126
126
  re.compile(r"\s+"): " ",
127
127
  # Remove leading and trailing whitespace and trailing semicolons.
@@ -131,10 +131,16 @@ _BASIC_NORMALIZATION_RULES = {
131
131
  # Replace anything that looks like a string with a placeholder.
132
132
  re.compile(r"'[^']*'"): "?",
133
133
  # Replace sequences of IN/VALUES with a single placeholder.
134
- re.compile(r"\b(IN|VALUES)\s*\(\?(?:, \?)*\)", re.IGNORECASE): r"\1 (?)",
134
+ # The r" ?" makes it more robust to uneven spacing.
135
+ re.compile(r"\b(IN|VALUES)\s*\( ?\?(?:, ?\?)* ?\)", re.IGNORECASE): r"\1 (?)",
135
136
  # Normalize parenthesis spacing.
136
137
  re.compile(r"\( "): "(",
137
138
  re.compile(r" \)"): ")",
139
+ # Fix up spaces before commas in column lists.
140
+ # e.g. "col1 , col2" -> "col1, col2"
141
+ # e.g. "col1,col2" -> "col1, col2"
142
+ re.compile(r"\b ,"): ",",
143
+ re.compile(r"\b,\b"): ", ",
138
144
  }
139
145
  _TABLE_NAME_NORMALIZATION_RULES = {
140
146
  # Replace UUID-like strings with a placeholder (both - and _ variants).