acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (65)
  1. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/METADATA +2659 -2578
  2. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/RECORD +65 -57
  3. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/entry_points.txt +1 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/graphql/operation.py +1 -1
  6. datahub/ingestion/autogenerated/capability_summary.json +45 -5
  7. datahub/ingestion/autogenerated/lineage.json +3 -2
  8. datahub/ingestion/run/pipeline.py +1 -0
  9. datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
  10. datahub/ingestion/source/common/subtypes.py +3 -0
  11. datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
  12. datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
  13. datahub/ingestion/source/dbt/dbt_common.py +74 -0
  14. datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
  15. datahub/ingestion/source/dremio/dremio_source.py +4 -0
  16. datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
  17. datahub/ingestion/source/excel/__init__.py +0 -0
  18. datahub/ingestion/source/excel/config.py +92 -0
  19. datahub/ingestion/source/excel/excel_file.py +539 -0
  20. datahub/ingestion/source/excel/profiling.py +308 -0
  21. datahub/ingestion/source/excel/report.py +49 -0
  22. datahub/ingestion/source/excel/source.py +662 -0
  23. datahub/ingestion/source/excel/util.py +18 -0
  24. datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
  25. datahub/ingestion/source/openapi.py +1 -1
  26. datahub/ingestion/source/powerbi/config.py +33 -0
  27. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  28. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
  29. datahub/ingestion/source/powerbi/powerbi.py +5 -0
  30. datahub/ingestion/source/s3/source.py +65 -59
  31. datahub/ingestion/source/snowflake/constants.py +2 -0
  32. datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
  33. datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
  34. datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
  35. datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
  36. datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
  37. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
  38. datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
  39. datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
  40. datahub/ingestion/source/snowflake/snowflake_v2.py +5 -1
  41. datahub/ingestion/source/sql/hive_metastore.py +1 -0
  42. datahub/ingestion/source/sql_queries.py +24 -2
  43. datahub/ingestion/source/state/checkpoint.py +3 -28
  44. datahub/metadata/_internal_schema_classes.py +568 -512
  45. datahub/metadata/_urns/urn_defs.py +1748 -1748
  46. datahub/metadata/schema.avsc +18242 -18168
  47. datahub/metadata/schemas/ChartInfo.avsc +2 -1
  48. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
  49. datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
  50. datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
  51. datahub/metadata/schemas/Ownership.avsc +69 -0
  52. datahub/metadata/schemas/StructuredProperties.avsc +69 -0
  53. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
  54. datahub/metadata/schemas/__init__.py +3 -3
  55. datahub/sdk/lineage_client.py +6 -26
  56. datahub/sdk/main_client.py +7 -3
  57. datahub/sdk/search_filters.py +16 -0
  58. datahub/specific/aspect_helpers/siblings.py +73 -0
  59. datahub/specific/dataset.py +2 -0
  60. datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
  61. datahub/sql_parsing/tool_meta_extractor.py +1 -3
  62. datahub/upgrade/upgrade.py +14 -2
  63. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/WHEEL +0 -0
  64. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/licenses/LICENSE +0 -0
  65. {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py
@@ -353,6 +353,19 @@ class PowerBiDashboardSourceConfig(
         "For example with an ODBC connection string 'DSN=database' where the database type "
         "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
     )
+    # ODBC DSN to database (or database.schema) mapping
+    dsn_to_database_schema: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to database names with optional schema names "
+        "(some database platforms such a MySQL use the table name pattern 'database.table', "
+        "while others use the pattern 'database.schema.table'). "
+        "This mapping is used in conjunction with ODBC SQL query parsing. "
+        "If SQL queries used with ODBC do not reference fully qualified tables names, "
+        "then you should configure mappings for your DSNs. "
+        "For example with an ODBC connection string 'DSN=database' where the database "
+        "is 'prod' you would configure the mapping as 'database: prod'. "
+        "If the database is 'prod' and the schema is 'data' then mapping would be 'database: prod.data'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -614,3 +627,23 @@ class PowerBiDashboardSourceConfig(
             "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
         )
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
+        if values.get("dsn_to_database_schema") is not None:
+            dsn_mapping = values.get("dsn_to_database_schema")
+            if not isinstance(dsn_mapping, dict):
+                raise ValueError("dsn_to_database_schema must contain key-value pairs")
+
+            for _key, value in dsn_mapping.items():
+                if not isinstance(value, str):
+                    raise ValueError(
+                        "dsn_to_database_schema mapping values must be strings"
+                    )
+                parts = value.split(".")
+                if len(parts) != 1 and len(parts) != 2:
+                    raise ValueError(
+                        f"dsn_to_database_schema invalid mapping value: {value}"
+                    )
+
+        return values
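For context, a minimal sketch of how the new dsn_to_database_schema option could sit next to the existing dsn_to_platform_name mapping in a Power BI source config. The DSN, database, and schema names below are hypothetical and the credential fields are placeholders; this fragment is illustrative, not taken from the diff above.

    # Hypothetical Power BI source config fragment:
    powerbi_config = {
        "tenant_id": "<tenant-id>",
        "client_id": "<client-id>",
        "client_secret": "<client-secret>",
        # Existing option: map an ODBC DSN to a DataHub platform name.
        "dsn_to_platform_name": {"sales_dsn": "postgres"},
        # New option: map an ODBC DSN to a database, or to database.schema,
        # so unqualified table names in ODBC SQL queries can be resolved.
        "dsn_to_database_schema": {
            "sales_dsn": "prod",         # platforms that use database.table
            "finance_dsn": "prod.data",  # platforms that use database.schema.table
        },
    }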
datahub/ingestion/source/powerbi/m_query/data_classes.py
@@ -76,3 +76,4 @@ class FunctionName(Enum):
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
     MYSQL_DATA_ACCESS = "MySQL.Database"
     ODBC_DATA_ACCESS = "Odbc.DataSource"
+    ODBC_QUERY = "Odbc.Query"
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -3,7 +3,9 @@ from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Dict, List, Optional, Tuple, Type, cast
 
+import sqlglot
 from lark import Tree
+from sqlglot import ParseError, expressions as exp
 
 from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
@@ -209,15 +211,34 @@ class AbstractLineage(ABC):
 
         return None
 
+    @staticmethod
+    def is_sql_query(query: Optional[str]) -> bool:
+        if not query:
+            return False
+        query = native_sql_parser.remove_special_characters(query)
+        try:
+            expression = sqlglot.parse_one(query)
+            return isinstance(expression, exp.Select)
+        except (ParseError, Exception):
+            logger.debug(f"Failed to parse query as SQL: {query}")
+            return False
+
     def parse_custom_sql(
-        self, query: str, server: str, database: Optional[str], schema: Optional[str]
+        self,
+        query: str,
+        server: str,
+        database: Optional[str],
+        schema: Optional[str],
+        platform_pair: Optional[DataPlatformPair] = None,
     ) -> Lineage:
         dataplatform_tables: List[DataPlatformTable] = []
+        if not platform_pair:
+            platform_pair = self.get_platform_pair()
 
         platform_detail: PlatformDetail = (
             self.platform_instance_resolver.get_platform_instance(
                 PowerBIPlatformDetail(
-                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_pair=platform_pair,
                     data_platform_server=server,
                 )
             )
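For illustration, the new is_sql_query helper relies on sqlglot and treats only parseable SELECT statements as SQL. A minimal standalone sketch of that check follows; the input strings are made up, and this is not the exact DataHub code path, which also strips special characters first.

    import sqlglot
    from sqlglot import expressions as exp

    def looks_like_select(text: str) -> bool:
        # Anything that does not parse as a SELECT statement is treated as "not SQL".
        try:
            return isinstance(sqlglot.parse_one(text), exp.Select)
        except Exception:
            return False

    print(looks_like_select("SELECT id, name FROM prod.data.users"))  # True
    print(looks_like_select("DSN=sales_dsn"))                         # False (not a SELECT)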
@@ -231,7 +252,7 @@ class AbstractLineage(ABC):
             native_sql_parser.parse_custom_sql(
                 ctx=self.ctx,
                 query=query,
-                platform=self.get_platform_pair().datahub_data_platform_name,
+                platform=platform_pair.datahub_data_platform_name,
                 platform_instance=platform_detail.platform_instance,
                 env=platform_detail.env,
                 database=database,
@@ -258,7 +279,7 @@ class AbstractLineage(ABC):
         for urn in parsed_result.in_tables:
             dataplatform_tables.append(
                 DataPlatformTable(
-                    data_platform_pair=self.get_platform_pair(),
+                    data_platform_pair=platform_pair,
                     urn=urn,
                 )
             )
@@ -956,7 +977,7 @@ class OdbcLineage(AbstractLineage):
             f"data-access function detail {data_access_func_detail}"
         )
 
-        connect_string, _ = self.get_db_detail_from_argument(
+        connect_string, query = self.get_db_detail_from_argument(
             data_access_func_detail.arg_list
         )
 
@@ -972,12 +993,19 @@ class OdbcLineage(AbstractLineage):
         data_platform, powerbi_platform = extract_platform(connect_string)
         server_name = extract_server(connect_string)
 
+        dsn = extract_dsn(connect_string)
+        if not dsn:
+            self.reporter.warning(
+                title="Can not determine ODBC DSN",
+                message="Can not extract DSN from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+        logger.debug(f"Extracted DSN: {dsn}")
+
         if not data_platform:
-            dsn = extract_dsn(connect_string)
-            if dsn:
-                logger.debug(f"Extracted DSN: {dsn}")
-                server_name = dsn
-            if dsn and self.config.dsn_to_platform_name:
+            server_name = dsn
+            if self.config.dsn_to_platform_name:
                 logger.debug(f"Attempting to map DSN {dsn} to platform")
                 name = self.config.dsn_to_platform_name.get(dsn)
                 if name:
@@ -1006,6 +1034,63 @@ class OdbcLineage(AbstractLineage):
         elif not server_name:
             server_name = "unknown"
 
+        if self.is_sql_query(query):
+            return self.query_lineage(query, platform_pair, server_name, dsn)
+        else:
+            return self.expression_lineage(
+                data_access_func_detail, data_platform, platform_pair, server_name
+            )
+
+    def query_lineage(
+        self,
+        query: Optional[str],
+        platform_pair: DataPlatformPair,
+        server_name: str,
+        dsn: str,
+    ) -> Lineage:
+        database = None
+        schema = None
+
+        if not query:
+            # query should never be None as it is checked before calling this function.
+            # however, we need to check just in case.
+            self.reporter.warning(
+                title="ODBC Query is null",
+                message="No SQL to parse. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+
+        if self.config.dsn_to_database_schema:
+            value = self.config.dsn_to_database_schema.get(dsn)
+            if value:
+                parts = value.split(".")
+                if len(parts) == 1:
+                    database = parts[0]
+                elif len(parts) == 2:
+                    database = parts[0]
+                    schema = parts[1]
+
+        logger.debug(
+            f"ODBC query processing: dsn={dsn} mapped to database={database}, schema={schema}"
+        )
+        result = self.parse_custom_sql(
+            query=query,
+            server=server_name,
+            database=database,
+            schema=schema,
+            platform_pair=platform_pair,
+        )
+        logger.debug(f"ODBC query lineage generated {len(result.upstreams)} upstreams")
+        return result
+
+    def expression_lineage(
+        self,
+        data_access_func_detail: DataAccessFunctionDetail,
+        data_platform: str,
+        platform_pair: DataPlatformPair,
+        server_name: str,
+    ) -> Lineage:
         database_name = None
         schema_name = None
         table_name = None
@@ -1144,6 +1229,11 @@ class SupportedPattern(Enum):
         FunctionName.ODBC_DATA_ACCESS,
     )
 
+    ODBC_QUERY = (
+        OdbcLineage,
+        FunctionName.ODBC_QUERY,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]
 
datahub/ingestion/source/powerbi/powerbi.py
@@ -40,6 +40,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIAssetSubTypes,
     BIContainerSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.powerbi.config import (
     Constant,
@@ -1229,6 +1230,10 @@ class Mapper:
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.POWERBI_WORKSPACE,
+        SourceCapabilityModifier.POWERBI_DATASET,
+    ],
 )
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/s3/source.py
@@ -34,7 +34,13 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags, list_folders
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    get_s3_tags,
+    list_folders,
+    list_folders_path,
+    list_objects_recursive,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
@@ -84,8 +90,6 @@ if TYPE_CHECKING:
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-PAGE_SIZE = 1000
-
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
@@ -384,7 +388,10 @@ class S3Source(StatefulIngestionSourceBase):
 
     def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]:
         logger.debug(f"Opening file {file} for profiling in spark")
-        file = file.replace("s3://", "s3a://")
+        if "s3://" in file:
+            # replace s3:// with s3a://, and make sure standalone bucket names always end with a slash.
+            # Spark will fail if given a path like `s3a://mybucket`, and requires it to be `s3a://mybucket/`.
+            file = f"s3a://{get_bucket_name(file)}/{get_bucket_relative_path(file)}"
 
         telemetry.telemetry_instance.ping("data_lake_file", {"extension": ext})
 
@@ -836,29 +843,31 @@ class S3Source(StatefulIngestionSourceBase):
             content_type=browse_path.content_type,
         )
 
-    def resolve_templated_folders(self, bucket_name: str, prefix: str) -> Iterable[str]:
+    def resolve_templated_folders(self, prefix: str) -> Iterable[str]:
         folder_split: List[str] = prefix.split("*", 1)
         # If the len of split is 1 it means we don't have * in the prefix
         if len(folder_split) == 1:
             yield prefix
             return
 
-        folders: Iterable[str] = list_folders(
-            bucket_name, folder_split[0], self.source_config.aws_config
+        basename_startswith = folder_split[0].split("/")[-1]
+        dirname = folder_split[0].removesuffix(basename_startswith)
+
+        folders = list_folders_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
         )
         for folder in folders:
-            # Ensure proper path joining - folder already includes trailing slash from list_folders
-            # but we need to handle the case where folder_split[1] might start with a slash
+            # Ensure proper path joining - folders from list_folders_path never include a
+            # trailing slash, but we need to handle the case where folder_split[1] might
+            # start with a slash
            remaining_pattern = folder_split[1]
            if remaining_pattern.startswith("/"):
                remaining_pattern = remaining_pattern[1:]
 
-            # Ensure folder ends with slash for proper path construction
-            if not folder.endswith("/"):
-                folder = folder + "/"
-
            yield from self.resolve_templated_folders(
-                bucket_name, f"{folder}{remaining_pattern}"
+                f"{folder.path}/{remaining_pattern}"
            )
 
     def get_dir_to_process(
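To illustrate the dirname/startswith split that the refactored resolve_templated_folders performs on the literal part of the prefix (everything before the first "*"), here is a small worked example; the bucket and folder names are made up, and the same kind of split is reused in _process_simple_path further below.

    # Everything before the first "*" is split into a directory to list and a name filter.
    prefix = "s3://my-bucket/data/year=2024/part-*/events/"  # portion before "{table}"
    literal = prefix.split("*", 1)[0]                        # "s3://my-bucket/data/year=2024/part-"
    basename_startswith = literal.split("/")[-1]             # "part-"
    dirname = literal.removesuffix(basename_startswith)      # "s3://my-bucket/data/year=2024/"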
@@ -942,7 +951,9 @@ class S3Source(StatefulIngestionSourceBase):
         # Instead of loading all objects into memory, we'll accumulate folder data incrementally
         folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
 
-        for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+        for obj in list_objects_recursive(
+            bucket.name, prefix, self.source_config.aws_config
+        ):
             s3_path = self.create_s3_path(obj.bucket_name, obj.key)
 
             if not _is_allowed_path(path_spec, s3_path):
@@ -1016,13 +1027,6 @@ class S3Source(StatefulIngestionSourceBase):
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
 
-        s3 = self.source_config.aws_config.get_s3_resource(
-            self.source_config.verify_ssl
-        )
-        bucket_name = get_bucket_name(path_spec.include)
-        bucket = s3.Bucket(bucket_name)
-
-        logger.debug(f"Scanning bucket: {bucket_name}")
         logger.info(f"Processing path spec: {path_spec.include}")
 
         # Check if we have {table} template in the path
@@ -1034,16 +1038,14 @@ class S3Source(StatefulIngestionSourceBase):
             logger.info("Using templated path processing")
             # Always use templated processing when {table} is present
             # This groups files under table-level datasets
-            yield from self._process_templated_path(path_spec, bucket, bucket_name)
+            yield from self._process_templated_path(path_spec)
         else:
             logger.info("Using simple path processing")
             # Only use simple processing for non-templated paths
             # This creates individual file-level datasets
-            yield from self._process_simple_path(path_spec, bucket, bucket_name)
+            yield from self._process_simple_path(path_spec)
 
-    def _process_templated_path(
-        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
-    ) -> Iterable[BrowsePath]:
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:  # noqa: C901
         """
         Process S3 paths containing {table} templates to create table-level datasets.
 
@@ -1057,12 +1059,17 @@ class S3Source(StatefulIngestionSourceBase):
 
         Args:
             path_spec: Path specification with {table} template
-            bucket: S3 bucket resource
-            bucket_name: Name of the S3 bucket
 
         Yields:
             BrowsePath: One per table (not per file), containing aggregated metadata
         """
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
+
         # Find the part before {table}
         table_marker = "{table}"
         if table_marker not in path_spec.include:
@@ -1097,20 +1104,13 @@ class S3Source(StatefulIngestionSourceBase):
 
         # Split the path at {table} to get the prefix that needs wildcard resolution
         prefix_before_table = include.split(table_marker)[0]
-        # Remove the s3:// and bucket name to get the relative path
-        relative_path = get_bucket_relative_path(prefix_before_table)
-
         logger.info(f"Prefix before table: {prefix_before_table}")
-        logger.info(f"Relative path for resolution: {relative_path}")
 
         try:
             # STEP 2: Resolve ALL wildcards in the path up to {table}
-            # This converts patterns like "data/*/logs/" to actual paths like ["data/2023/logs/", "data/2024/logs/"]
-            table_index = include.find(table_marker)
-            folder_prefix = get_bucket_relative_path(include[:table_index])
-
+            # This converts patterns like "s3://data/*/logs/" to actual paths like ["s3://data/2023/logs/", "s3://data/2024/logs/"]
             resolved_prefixes = list(
-                self.resolve_templated_folders(bucket_name, folder_prefix)
+                self.resolve_templated_folders(prefix_before_table)
             )
             logger.info(f"Resolved prefixes: {resolved_prefixes}")
 
@@ -1121,20 +1121,22 @@ class S3Source(StatefulIngestionSourceBase):
                 # Get all folders that could be tables under this resolved prefix
                 # These are the actual table names (e.g., "users", "events", "logs")
                 table_folders = list(
-                    list_folders(
-                        bucket_name, resolved_prefix, self.source_config.aws_config
+                    list_folders_path(
+                        resolved_prefix, aws_config=self.source_config.aws_config
                     )
                 )
                 logger.debug(
-                    f"Found table folders under {resolved_prefix}: {table_folders}"
+                    f"Found table folders under {resolved_prefix}: {[folder.name for folder in table_folders]}"
                 )
 
                 # STEP 4: Process each table folder to create a table-level dataset
-                for table_folder in table_folders:
+                for folder in table_folders:
+                    bucket_name = get_bucket_name(folder.path)
+                    table_folder = get_bucket_relative_path(folder.path)
+                    bucket = s3.Bucket(bucket_name)
+
                     # Create the full S3 path for this table
-                    table_s3_path = self.create_s3_path(
-                        bucket_name, table_folder.rstrip("/")
-                    )
+                    table_s3_path = self.create_s3_path(bucket_name, table_folder)
                     logger.info(
                         f"Processing table folder: {table_folder} -> {table_s3_path}"
                     )
@@ -1269,17 +1271,16 @@ class S3Source(StatefulIngestionSourceBase):
                     )
 
         except Exception as e:
-            if "NoSuchBucket" in repr(e):
+            if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
                 self.get_report().report_warning(
-                    "Missing bucket", f"No bucket found {bucket_name}"
+                    "Missing bucket",
+                    f"No bucket found {e.response['Error'].get('BucketName')}",
                 )
                 return
             logger.error(f"Error in _process_templated_path: {e}")
             raise e
 
-    def _process_simple_path(
-        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
-    ) -> Iterable[BrowsePath]:
+    def _process_simple_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         """
         Process simple S3 paths without {table} templates to create file-level datasets.
 
@@ -1295,8 +1296,6 @@ class S3Source(StatefulIngestionSourceBase):
 
         Args:
             path_spec: Path specification without {table} template
-            bucket: S3 bucket resource
-            bucket_name: Name of the S3 bucket
 
         Yields:
             BrowsePath: One per file, containing individual file metadata
@@ -1305,20 +1304,27 @@ class S3Source(StatefulIngestionSourceBase):
             - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
             - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
         """
-        assert self.source_config.aws_config is not None, "aws_config not set"
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
 
         path_spec.sample_files = False  # Disable sampling for simple paths
 
         # Extract the prefix from the path spec (stops at first wildcard)
-        prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
+        prefix = self.get_prefix(path_spec.include)
 
-        # Get s3 resource for content type checking
-        s3 = self.source_config.aws_config.get_s3_resource(
-            self.source_config.verify_ssl
-        )
+        basename_startswith = prefix.split("/")[-1]
+        dirname = prefix.removesuffix(basename_startswith)
 
         # Iterate through all objects in the bucket matching the prefix
-        for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+        for obj in list_objects_recursive_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
+        ):
             s3_path = self.create_s3_path(obj.bucket_name, obj.key)
 
             # Get content type if configured
datahub/ingestion/source/snowflake/constants.py
@@ -9,6 +9,8 @@ class SnowflakeCloudProvider(StrEnum):
 
 SNOWFLAKE_DEFAULT_CLOUD = SnowflakeCloudProvider.AWS
 
+DEFAULT_SNOWFLAKE_DOMAIN = "snowflakecomputing.com"
+
 
 class SnowflakeEdition(StrEnum):
     STANDARD = "Standard"
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -216,6 +216,16 @@ class SnowflakeV2Config(
         description="If enabled, populates the ingested views' definitions.",
     )
 
+    fetch_views_from_information_schema: bool = Field(
+        default=False,
+        description="If enabled, uses information_schema.views to fetch view definitions instead of SHOW VIEWS command. "
+        "This alternative method can be more reliable for databases with large numbers of views (> 10K views), as the "
+        "SHOW VIEWS approach has proven unreliable and can lead to missing views in such scenarios. However, this method "
+        "requires OWNERSHIP privileges on views to retrieve their definitions. For views without ownership permissions "
+        "(where VIEW_DEFINITION is null/empty), the system will automatically fall back to using batched SHOW VIEWS queries "
+        "to populate the missing definitions.",
+    )
+
     include_technical_schema: bool = Field(
         default=True,
         description="If enabled, populates the snowflake technical schema and descriptions.",
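As an illustration, a recipe-style fragment that opts in to the new view-fetching behaviour; the account and credential values below are placeholders, not taken from the diff.

    # Hypothetical Snowflake source config fragment:
    snowflake_config = {
        "account_id": "myorg-myaccount",
        "username": "datahub_user",
        "password": "<password>",
        "warehouse": "COMPUTE_WH",
        # Use information_schema.views for databases with very large view counts;
        # views whose definitions are not readable (no OWNERSHIP) fall back to
        # batched SHOW VIEWS queries, as described above.
        "fetch_views_from_information_schema": True,
    }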
datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -22,6 +22,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.source.snowflake.constants import (
     CLIENT_PREFETCH_THREADS,
     CLIENT_SESSION_KEEP_ALIVE,
+    DEFAULT_SNOWFLAKE_DOMAIN,
 )
 from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthConfiguration,
@@ -47,8 +48,6 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }
 
-_SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
-
 
 class SnowflakePermissionError(MetaError):
     """A permission error has happened"""
@@ -110,6 +109,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         default=None,
         description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
     )
+    snowflake_domain: str = pydantic.Field(
+        default=DEFAULT_SNOWFLAKE_DOMAIN,
+        description="Snowflake domain. Use 'snowflakecomputing.com' for most regions or 'snowflakecomputing.cn' for China (cn-northwest-1) region.",
+    )
 
     def get_account(self) -> str:
         assert self.account_id
@@ -118,10 +121,13 @@ class SnowflakeConnectionConfig(ConfigModel):
     rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")
 
     @pydantic.validator("account_id")
-    def validate_account_id(cls, account_id: str) -> str:
+    def validate_account_id(cls, account_id: str, values: Dict) -> str:
         account_id = remove_protocol(account_id)
         account_id = remove_trailing_slashes(account_id)
-        account_id = remove_suffix(account_id, _SNOWFLAKE_HOST_SUFFIX)
+        # Get the domain from config, fallback to default
+        domain = values.get("snowflake_domain", DEFAULT_SNOWFLAKE_DOMAIN)
+        snowflake_host_suffix = f".{domain}"
+        account_id = remove_suffix(account_id, snowflake_host_suffix)
         return account_id
 
     @pydantic.validator("authentication_type", always=True)
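A rough, standalone sketch of what the updated validator does for a non-default domain; the account name is made up, and removeprefix/rstrip stand in for the remove_protocol and remove_trailing_slashes helpers.

    # With snowflake_domain set to "snowflakecomputing.cn":
    account_id = "https://myorg-account1.snowflakecomputing.cn/"
    account_id = account_id.removeprefix("https://").rstrip("/")    # ~ remove_protocol + remove_trailing_slashes
    account_id = account_id.removesuffix(".snowflakecomputing.cn")  # suffix derived from snowflake_domain
    # account_id == "myorg-account1"; connections are then made with
    # host = f"{account_id}.snowflakecomputing.cn"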
@@ -311,6 +317,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
 
@@ -324,6 +331,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 role=self.role,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
 
@@ -337,6 +345,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 role=self.role,
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
@@ -348,6 +357,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 role=self.role,
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
@@ -363,6 +373,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 role=self.role,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         else:
@@ -408,7 +419,7 @@ class SnowflakeConnection(Closeable):
         # We often run multiple queries in parallel across multiple threads,
         # so we need to number them to help with log readability.
         query_num = self.get_query_no()
-        logger.info(f"Query #{query_num}: {query}", stacklevel=2)
+        logger.info(f"Query #{query_num}: {query.rstrip()}", stacklevel=2)
         resp = self._connection.cursor(DictCursor).execute(query)
         if resp is not None and resp.rowcount is not None:
             logger.info(
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -266,6 +266,33 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """
 
+    @staticmethod
+    def get_views_for_database(db_name: str) -> str:
+        # We've seen some issues with the `SHOW VIEWS` query,
+        # particularly when it requires pagination.
+        # This is an experimental alternative query that might be more reliable.
+        return f"""\
+SELECT
+    TABLE_CATALOG as "VIEW_CATALOG",
+    TABLE_SCHEMA as "VIEW_SCHEMA",
+    TABLE_NAME as "VIEW_NAME",
+    COMMENT,
+    VIEW_DEFINITION,
+    CREATED,
+    LAST_ALTERED,
+    IS_SECURE
+FROM "{db_name}".information_schema.views
+WHERE TABLE_CATALOG = '{db_name}'
+    AND TABLE_SCHEMA != 'INFORMATION_SCHEMA'
+"""
+
+    @staticmethod
+    def get_views_for_schema(db_name: str, schema_name: str) -> str:
+        return f"""\
+{SnowflakeQuery.get_views_for_database(db_name).rstrip()}
+    AND TABLE_SCHEMA = '{schema_name}'
+"""
+
     @staticmethod
     def get_secure_view_definitions() -> str:
         # https://docs.snowflake.com/en/sql-reference/account-usage/views
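For reference, the schema-level helper just reuses the database-level query and narrows it; with hypothetical names it would be used as:

    # Hypothetical database and schema names:
    sql = SnowflakeQuery.get_views_for_schema("ANALYTICS", "REPORTING")
    # -> the SELECT above against "ANALYTICS".information_schema.views, with an
    #    extra predicate: AND TABLE_SCHEMA = 'REPORTING'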
datahub/ingestion/source/snowflake/snowflake_report.py
@@ -128,6 +128,7 @@ class SnowflakeV2Report(
     # "Information schema query returned too much data. Please repeat query with more selective predicates.""
     # This will result in overall increase in time complexity
     num_get_tables_for_schema_queries: int = 0
+    num_get_views_for_schema_queries: int = 0
 
     # these will be non-zero if the user choses to enable the extract_tags = "with_lineage" option, which requires
     # individual queries per object (database, schema, table) and an extra query per table to get the tags on the columns.