acryl-datahub 1.2.0.6rc1__py3-none-any.whl → 1.2.0.7rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/METADATA +2659 -2578
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/RECORD +65 -57
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +45 -5
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +21 -6
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +5 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/metadata/_internal_schema_classes.py +568 -512
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18242 -18168
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6rc1.dist-info → acryl_datahub-1.2.0.7rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/powerbi/config.py
@@ -353,6 +353,19 @@ class PowerBiDashboardSourceConfig(
         "For example with an ODBC connection string 'DSN=database' where the database type "
         "is 'PostgreSQL' you would configure the mapping as 'database: postgres'.",
     )
+    # ODBC DSN to database (or database.schema) mapping
+    dsn_to_database_schema: Dict[str, str] = pydantic.Field(
+        default={},
+        description="A mapping of ODBC DSN to database names with optional schema names "
+        "(some database platforms such a MySQL use the table name pattern 'database.table', "
+        "while others use the pattern 'database.schema.table'). "
+        "This mapping is used in conjunction with ODBC SQL query parsing. "
+        "If SQL queries used with ODBC do not reference fully qualified tables names, "
+        "then you should configure mappings for your DSNs. "
+        "For example with an ODBC connection string 'DSN=database' where the database "
+        "is 'prod' you would configure the mapping as 'database: prod'. "
+        "If the database is 'prod' and the schema is 'data' then mapping would be 'database: prod.data'.",
+    )
     # deprecated warning
     _dataset_type_mapping = pydantic_field_deprecated(
         "dataset_type_mapping",
@@ -614,3 +627,23 @@ class PowerBiDashboardSourceConfig(
             "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
         )
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_dsn_to_database_schema(cls, values: Dict) -> Dict:
+        if values.get("dsn_to_database_schema") is not None:
+            dsn_mapping = values.get("dsn_to_database_schema")
+            if not isinstance(dsn_mapping, dict):
+                raise ValueError("dsn_to_database_schema must contain key-value pairs")
+
+            for _key, value in dsn_mapping.items():
+                if not isinstance(value, str):
+                    raise ValueError(
+                        "dsn_to_database_schema mapping values must be strings"
+                    )
+                parts = value.split(".")
+                if len(parts) != 1 and len(parts) != 2:
+                    raise ValueError(
+                        f"dsn_to_database_schema invalid mapping value: {value}"
+                    )
+
+        return values
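The new `dsn_to_database_schema` option accepts either a bare database name or a `database.schema` pair per DSN. A minimal standalone sketch (with invented DSN names) of how such values are interpreted, mirroring the split that `OdbcLineage.query_lineage` performs further below:

    # Hypothetical config values; the DSN names are made up for illustration.
    dsn_to_database_schema = {
        "sales_dsn": "prod",         # database only
        "finance_dsn": "prod.data",  # database plus schema
    }

    for dsn, value in dsn_to_database_schema.items():
        parts = value.split(".")
        if len(parts) not in (1, 2):
            # same rule the validator above enforces
            raise ValueError(f"invalid mapping value: {value}")
        database = parts[0]
        schema = parts[1] if len(parts) == 2 else None
        print(f"{dsn}: database={database}, schema={schema}")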
datahub/ingestion/source/powerbi/m_query/pattern_handler.py
@@ -3,7 +3,9 @@ from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Dict, List, Optional, Tuple, Type, cast

+import sqlglot
 from lark import Tree
+from sqlglot import ParseError, expressions as exp

 from datahub.configuration.source_common import PlatformDetail
 from datahub.emitter import mce_builder as builder
@@ -209,15 +211,34 @@ class AbstractLineage(ABC):

         return None

+    @staticmethod
+    def is_sql_query(query: Optional[str]) -> bool:
+        if not query:
+            return False
+        query = native_sql_parser.remove_special_characters(query)
+        try:
+            expression = sqlglot.parse_one(query)
+            return isinstance(expression, exp.Select)
+        except (ParseError, Exception):
+            logger.debug(f"Failed to parse query as SQL: {query}")
+            return False
+
     def parse_custom_sql(
-        self,
+        self,
+        query: str,
+        server: str,
+        database: Optional[str],
+        schema: Optional[str],
+        platform_pair: Optional[DataPlatformPair] = None,
     ) -> Lineage:
         dataplatform_tables: List[DataPlatformTable] = []
+        if not platform_pair:
+            platform_pair = self.get_platform_pair()

         platform_detail: PlatformDetail = (
             self.platform_instance_resolver.get_platform_instance(
                 PowerBIPlatformDetail(
-                    data_platform_pair=
+                    data_platform_pair=platform_pair,
                     data_platform_server=server,
                 )
             )
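The `is_sql_query` helper added above decides whether an ODBC argument is a full SELECT statement or just a table reference. A self-contained sketch of the same idea using only the public sqlglot API (the sample strings are invented); the in-tree version additionally strips special characters via `native_sql_parser.remove_special_characters` first:

    import sqlglot
    from sqlglot import ParseError, expressions as exp

    def looks_like_select(text: str) -> bool:
        # Parse the text and check whether the top-level expression is a SELECT.
        try:
            return isinstance(sqlglot.parse_one(text), exp.Select)
        except ParseError:
            return False

    print(looks_like_select("SELECT id, name FROM prod.data.users"))  # True
    print(looks_like_select("dbo.users"))  # False: a bare table reference, not a SELECT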
@@ -231,7 +252,7 @@ class AbstractLineage(ABC):
             native_sql_parser.parse_custom_sql(
                 ctx=self.ctx,
                 query=query,
-                platform=
+                platform=platform_pair.datahub_data_platform_name,
                 platform_instance=platform_detail.platform_instance,
                 env=platform_detail.env,
                 database=database,
@@ -258,7 +279,7 @@ class AbstractLineage(ABC):
         for urn in parsed_result.in_tables:
             dataplatform_tables.append(
                 DataPlatformTable(
-                    data_platform_pair=
+                    data_platform_pair=platform_pair,
                     urn=urn,
                 )
             )
@@ -956,7 +977,7 @@ class OdbcLineage(AbstractLineage):
             f"data-access function detail {data_access_func_detail}"
         )

-        connect_string,
+        connect_string, query = self.get_db_detail_from_argument(
            data_access_func_detail.arg_list
        )

@@ -972,12 +993,19 @@ class OdbcLineage(AbstractLineage):
         data_platform, powerbi_platform = extract_platform(connect_string)
         server_name = extract_server(connect_string)

+        dsn = extract_dsn(connect_string)
+        if not dsn:
+            self.reporter.warning(
+                title="Can not determine ODBC DSN",
+                message="Can not extract DSN from ODBC connect string. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}, connect-string={connect_string}",
+            )
+            return Lineage.empty()
+        logger.debug(f"Extracted DSN: {dsn}")
+
         if not data_platform:
-
-            if
-            logger.debug(f"Extracted DSN: {dsn}")
-            server_name = dsn
-            if dsn and self.config.dsn_to_platform_name:
+            server_name = dsn
+            if self.config.dsn_to_platform_name:
                 logger.debug(f"Attempting to map DSN {dsn} to platform")
                 name = self.config.dsn_to_platform_name.get(dsn)
                 if name:
@@ -1006,6 +1034,63 @@ class OdbcLineage(AbstractLineage):
         elif not server_name:
             server_name = "unknown"

+        if self.is_sql_query(query):
+            return self.query_lineage(query, platform_pair, server_name, dsn)
+        else:
+            return self.expression_lineage(
+                data_access_func_detail, data_platform, platform_pair, server_name
+            )
+
+    def query_lineage(
+        self,
+        query: Optional[str],
+        platform_pair: DataPlatformPair,
+        server_name: str,
+        dsn: str,
+    ) -> Lineage:
+        database = None
+        schema = None
+
+        if not query:
+            # query should never be None as it is checked before calling this function.
+            # however, we need to check just in case.
+            self.reporter.warning(
+                title="ODBC Query is null",
+                message="No SQL to parse. Skipping Lineage creation.",
+                context=f"table-name={self.table.full_name}",
+            )
+            return Lineage.empty()
+
+        if self.config.dsn_to_database_schema:
+            value = self.config.dsn_to_database_schema.get(dsn)
+            if value:
+                parts = value.split(".")
+                if len(parts) == 1:
+                    database = parts[0]
+                elif len(parts) == 2:
+                    database = parts[0]
+                    schema = parts[1]
+
+        logger.debug(
+            f"ODBC query processing: dsn={dsn} mapped to database={database}, schema={schema}"
+        )
+        result = self.parse_custom_sql(
+            query=query,
+            server=server_name,
+            database=database,
+            schema=schema,
+            platform_pair=platform_pair,
+        )
+        logger.debug(f"ODBC query lineage generated {len(result.upstreams)} upstreams")
+        return result
+
+    def expression_lineage(
+        self,
+        data_access_func_detail: DataAccessFunctionDetail,
+        data_platform: str,
+        platform_pair: DataPlatformPair,
+        server_name: str,
+    ) -> Lineage:
         database_name = None
         schema_name = None
         table_name = None
@@ -1144,6 +1229,11 @@ class SupportedPattern(Enum):
         FunctionName.ODBC_DATA_ACCESS,
     )

+    ODBC_QUERY = (
+        OdbcLineage,
+        FunctionName.ODBC_QUERY,
+    )
+
     def handler(self) -> Type[AbstractLineage]:
         return self.value[0]

datahub/ingestion/source/powerbi/powerbi.py
@@ -40,6 +40,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIAssetSubTypes,
     BIContainerSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.powerbi.config import (
     Constant,
@@ -1229,6 +1230,10 @@ class Mapper:
 @capability(
     SourceCapability.CONTAINERS,
     "Enabled by default",
+    subtype_modifier=[
+        SourceCapabilityModifier.POWERBI_WORKSPACE,
+        SourceCapabilityModifier.POWERBI_DATASET,
+    ],
 )
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.OWNERSHIP, "Enabled by default")
datahub/ingestion/source/s3/source.py
@@ -34,7 +34,13 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_boto_utils import
+from datahub.ingestion.source.aws.s3_boto_utils import (
+    get_s3_tags,
+    list_folders,
+    list_folders_path,
+    list_objects_recursive,
+    list_objects_recursive_path,
+)
 from datahub.ingestion.source.aws.s3_util import (
     get_bucket_name,
     get_bucket_relative_path,
@@ -84,8 +90,6 @@ if TYPE_CHECKING:
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)

-PAGE_SIZE = 1000
-
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])

@@ -384,7 +388,10 @@ class S3Source(StatefulIngestionSourceBase):

     def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]:
         logger.debug(f"Opening file {file} for profiling in spark")
-
+        if "s3://" in file:
+            # replace s3:// with s3a://, and make sure standalone bucket names always end with a slash.
+            # Spark will fail if given a path like `s3a://mybucket`, and requires it to be `s3a://mybucket/`.
+            file = f"s3a://{get_bucket_name(file)}/{get_bucket_relative_path(file)}"

         telemetry.telemetry_instance.ping("data_lake_file", {"extension": ext})

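For readers unfamiliar with the `s3a://` scheme: the change above rewrites the URI before handing it to Spark. A rough standalone approximation using plain string handling (the real code relies on the `get_bucket_name`/`get_bucket_relative_path` helpers from `s3_util`):

    def to_spark_path(file: str) -> str:
        # Rewrite s3:// URIs to the s3a:// scheme expected by Spark's Hadoop S3 connector,
        # and make sure a bare bucket name ends with a trailing slash.
        if file.startswith("s3://"):
            bucket, _, key = file[len("s3://"):].partition("/")
            return f"s3a://{bucket}/{key}"
        return file

    print(to_spark_path("s3://mybucket"))             # s3a://mybucket/
    print(to_spark_path("s3://mybucket/data/x.csv"))  # s3a://mybucket/data/x.csv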
@@ -836,29 +843,31 @@ class S3Source(StatefulIngestionSourceBase):
                 content_type=browse_path.content_type,
             )

-    def resolve_templated_folders(self,
+    def resolve_templated_folders(self, prefix: str) -> Iterable[str]:
         folder_split: List[str] = prefix.split("*", 1)
         # If the len of split is 1 it means we don't have * in the prefix
         if len(folder_split) == 1:
             yield prefix
             return

-
-
+        basename_startswith = folder_split[0].split("/")[-1]
+        dirname = folder_split[0].removesuffix(basename_startswith)
+
+        folders = list_folders_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
         )
         for folder in folders:
-            # Ensure proper path joining -
-            # but we need to handle the case where folder_split[1] might
+            # Ensure proper path joining - folders from list_folders path never include a
+            # trailing slash, but we need to handle the case where folder_split[1] might
+            # start with a slash
             remaining_pattern = folder_split[1]
             if remaining_pattern.startswith("/"):
                 remaining_pattern = remaining_pattern[1:]

-            # Ensure folder ends with slash for proper path construction
-            if not folder.endswith("/"):
-                folder = folder + "/"
-
             yield from self.resolve_templated_folders(
-
+                f"{folder.path}/{remaining_pattern}"
             )

     def get_dir_to_process(
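The dirname/startswith split introduced above (and used again in `_process_simple_path` later in this file) can be shown with plain string operations; the example prefix is made up, and `str.removesuffix` requires Python 3.9+:

    prefix = "s3://my-bucket/data/2024/tab"  # everything before the first '*' in the path spec

    basename_startswith = prefix.split("/")[-1]         # "tab"
    dirname = prefix.removesuffix(basename_startswith)  # "s3://my-bucket/data/2024/"

    # list_folders_path(dirname, startswith=basename_startswith, ...) can then filter
    # folders whose names begin with "tab" directly while listing S3.
    print(dirname, basename_startswith)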
@@ -942,7 +951,9 @@ class S3Source(StatefulIngestionSourceBase):
         # Instead of loading all objects into memory, we'll accumulate folder data incrementally
         folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo

-        for obj in
+        for obj in list_objects_recursive(
+            bucket.name, prefix, self.source_config.aws_config
+        ):
             s3_path = self.create_s3_path(obj.bucket_name, obj.key)

             if not _is_allowed_path(path_spec, s3_path):
@@ -1016,13 +1027,6 @@ class S3Source(StatefulIngestionSourceBase):
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")

-        s3 = self.source_config.aws_config.get_s3_resource(
-            self.source_config.verify_ssl
-        )
-        bucket_name = get_bucket_name(path_spec.include)
-        bucket = s3.Bucket(bucket_name)
-
-        logger.debug(f"Scanning bucket: {bucket_name}")
         logger.info(f"Processing path spec: {path_spec.include}")

         # Check if we have {table} template in the path
@@ -1034,16 +1038,14 @@ class S3Source(StatefulIngestionSourceBase):
             logger.info("Using templated path processing")
             # Always use templated processing when {table} is present
             # This groups files under table-level datasets
-            yield from self._process_templated_path(path_spec
+            yield from self._process_templated_path(path_spec)
         else:
             logger.info("Using simple path processing")
             # Only use simple processing for non-templated paths
             # This creates individual file-level datasets
-            yield from self._process_simple_path(path_spec
+            yield from self._process_simple_path(path_spec)

-    def _process_templated_path(
-        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
-    ) -> Iterable[BrowsePath]:
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:  # noqa: C901
         """
         Process S3 paths containing {table} templates to create table-level datasets.

@@ -1057,12 +1059,17 @@ class S3Source(StatefulIngestionSourceBase):

         Args:
             path_spec: Path specification with {table} template
-            bucket: S3 bucket resource
-            bucket_name: Name of the S3 bucket

         Yields:
             BrowsePath: One per table (not per file), containing aggregated metadata
         """
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set. Cannot browse s3")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
+
         # Find the part before {table}
         table_marker = "{table}"
         if table_marker not in path_spec.include:
@@ -1097,20 +1104,13 @@ class S3Source(StatefulIngestionSourceBase):

         # Split the path at {table} to get the prefix that needs wildcard resolution
         prefix_before_table = include.split(table_marker)[0]
-        # Remove the s3:// and bucket name to get the relative path
-        relative_path = get_bucket_relative_path(prefix_before_table)
-
         logger.info(f"Prefix before table: {prefix_before_table}")
-        logger.info(f"Relative path for resolution: {relative_path}")

         try:
             # STEP 2: Resolve ALL wildcards in the path up to {table}
-            # This converts patterns like "data/*/logs/" to actual paths like ["data/2023/logs/", "data/2024/logs/"]
-            table_index = include.find(table_marker)
-            folder_prefix = get_bucket_relative_path(include[:table_index])
-
+            # This converts patterns like "s3://data/*/logs/" to actual paths like ["s3://data/2023/logs/", "s3://data/2024/logs/"]
             resolved_prefixes = list(
-                self.resolve_templated_folders(
+                self.resolve_templated_folders(prefix_before_table)
             )
             logger.info(f"Resolved prefixes: {resolved_prefixes}")

@@ -1121,20 +1121,22 @@ class S3Source(StatefulIngestionSourceBase):
                 # Get all folders that could be tables under this resolved prefix
                 # These are the actual table names (e.g., "users", "events", "logs")
                 table_folders = list(
-
-
+                    list_folders_path(
+                        resolved_prefix, aws_config=self.source_config.aws_config
                     )
                 )
                 logger.debug(
-                    f"Found table folders under {resolved_prefix}: {table_folders}"
+                    f"Found table folders under {resolved_prefix}: {[folder.name for folder in table_folders]}"
                 )

                 # STEP 4: Process each table folder to create a table-level dataset
-                for
+                for folder in table_folders:
+                    bucket_name = get_bucket_name(folder.path)
+                    table_folder = get_bucket_relative_path(folder.path)
+                    bucket = s3.Bucket(bucket_name)
+
                     # Create the full S3 path for this table
-                    table_s3_path = self.create_s3_path(
-                        bucket_name, table_folder.rstrip("/")
-                    )
+                    table_s3_path = self.create_s3_path(bucket_name, table_folder)
                     logger.info(
                         f"Processing table folder: {table_folder} -> {table_s3_path}"
                     )
@@ -1269,17 +1271,16 @@ class S3Source(StatefulIngestionSourceBase):
                     )

         except Exception as e:
-            if
+            if isinstance(e, s3.meta.client.exceptions.NoSuchBucket):
                 self.get_report().report_warning(
-                    "Missing bucket",
+                    "Missing bucket",
+                    f"No bucket found {e.response['Error'].get('BucketName')}",
                 )
                 return
             logger.error(f"Error in _process_templated_path: {e}")
             raise e

-    def _process_simple_path(
-        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
-    ) -> Iterable[BrowsePath]:
+    def _process_simple_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         """
         Process simple S3 paths without {table} templates to create file-level datasets.

@@ -1295,8 +1296,6 @@ class S3Source(StatefulIngestionSourceBase):

         Args:
             path_spec: Path specification without {table} template
-            bucket: S3 bucket resource
-            bucket_name: Name of the S3 bucket

         Yields:
             BrowsePath: One per file, containing individual file metadata
@@ -1305,20 +1304,27 @@ class S3Source(StatefulIngestionSourceBase):
             - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
             - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
         """
-
+
+        if self.source_config.aws_config is None:
+            raise ValueError("aws_config not set")
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )

         path_spec.sample_files = False  # Disable sampling for simple paths

         # Extract the prefix from the path spec (stops at first wildcard)
-        prefix = self.get_prefix(
+        prefix = self.get_prefix(path_spec.include)

-
-
-            self.source_config.verify_ssl
-        )
+        basename_startswith = prefix.split("/")[-1]
+        dirname = prefix.removesuffix(basename_startswith)

         # Iterate through all objects in the bucket matching the prefix
-        for obj in
+        for obj in list_objects_recursive_path(
+            dirname,
+            startswith=basename_startswith,
+            aws_config=self.source_config.aws_config,
+        ):
             s3_path = self.create_s3_path(obj.bucket_name, obj.key)

             # Get content type if configured
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -216,6 +216,16 @@ class SnowflakeV2Config(
         description="If enabled, populates the ingested views' definitions.",
     )

+    fetch_views_from_information_schema: bool = Field(
+        default=False,
+        description="If enabled, uses information_schema.views to fetch view definitions instead of SHOW VIEWS command. "
+        "This alternative method can be more reliable for databases with large numbers of views (> 10K views), as the "
+        "SHOW VIEWS approach has proven unreliable and can lead to missing views in such scenarios. However, this method "
+        "requires OWNERSHIP privileges on views to retrieve their definitions. For views without ownership permissions "
+        "(where VIEW_DEFINITION is null/empty), the system will automatically fall back to using batched SHOW VIEWS queries "
+        "to populate the missing definitions.",
+    )
+
     include_technical_schema: bool = Field(
         default=True,
         description="If enabled, populates the snowflake technical schema and descriptions.",
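As a reading aid for the new `fetch_views_from_information_schema` option: a rough sketch of the fallback behavior the description promises, not the actual snowflake_schema.py implementation (the helper name and row shape are invented):

    from typing import Dict, List, Optional

    def resolve_view_definitions(
        info_schema_rows: List[Dict[str, Optional[str]]],
        show_views_definitions: Dict[str, str],
    ) -> Dict[str, str]:
        # Prefer VIEW_DEFINITION from information_schema.views; where it is null/empty
        # (no OWNERSHIP on the view), fall back to a definition fetched via batched SHOW VIEWS.
        resolved: Dict[str, str] = {}
        for row in info_schema_rows:
            name = row["VIEW_NAME"] or ""
            resolved[name] = row.get("VIEW_DEFINITION") or show_views_definitions.get(name, "")
        return resolved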
datahub/ingestion/source/snowflake/snowflake_connection.py
@@ -22,6 +22,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.source.snowflake.constants import (
     CLIENT_PREFETCH_THREADS,
     CLIENT_SESSION_KEEP_ALIVE,
+    DEFAULT_SNOWFLAKE_DOMAIN,
 )
 from datahub.ingestion.source.snowflake.oauth_config import (
     OAuthConfiguration,
@@ -47,8 +48,6 @@ _VALID_AUTH_TYPES: Dict[str, str] = {
     "OAUTH_AUTHENTICATOR_TOKEN": OAUTH_AUTHENTICATOR,
 }

-_SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
-

 class SnowflakePermissionError(MetaError):
     """A permission error has happened"""
@@ -110,6 +109,10 @@ class SnowflakeConnectionConfig(ConfigModel):
         default=None,
         description="OAuth token from external identity provider. Not recommended for most use cases because it will not be able to refresh once expired.",
     )
+    snowflake_domain: str = pydantic.Field(
+        default=DEFAULT_SNOWFLAKE_DOMAIN,
+        description="Snowflake domain. Use 'snowflakecomputing.com' for most regions or 'snowflakecomputing.cn' for China (cn-northwest-1) region.",
+    )

     def get_account(self) -> str:
         assert self.account_id
@@ -118,10 +121,13 @@
     rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")

     @pydantic.validator("account_id")
-    def validate_account_id(cls, account_id: str) -> str:
+    def validate_account_id(cls, account_id: str, values: Dict) -> str:
         account_id = remove_protocol(account_id)
         account_id = remove_trailing_slashes(account_id)
-
+        # Get the domain from config, fallback to default
+        domain = values.get("snowflake_domain", DEFAULT_SNOWFLAKE_DOMAIN)
+        snowflake_host_suffix = f".{domain}"
+        account_id = remove_suffix(account_id, snowflake_host_suffix)
         return account_id

     @pydantic.validator("authentication_type", always=True)
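A standalone sketch of the account-id normalization above, reduced to plain string operations. It assumes the new `DEFAULT_SNOWFLAKE_DOMAIN` constant is 'snowflakecomputing.com', as implied by the removed `_SNOWFLAKE_HOST_SUFFIX` constant, and the account identifiers are made up:

    DEFAULT_SNOWFLAKE_DOMAIN = "snowflakecomputing.com"  # assumed value of the new constant

    def normalize_account_id(account_id: str, snowflake_domain: str = DEFAULT_SNOWFLAKE_DOMAIN) -> str:
        # Strip the configurable host suffix so both bare account ids and full hostnames work.
        return account_id.removesuffix(f".{snowflake_domain}")

    print(normalize_account_id("myorg-myaccount.snowflakecomputing.com"))                     # myorg-myaccount
    print(normalize_account_id("myorg-china.snowflakecomputing.cn", "snowflakecomputing.cn")) # myorg-china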
@@ -311,6 +317,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )

@@ -324,6 +331,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 role=self.role,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )

@@ -337,6 +345,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 role=self.role,
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR_TOKEN":
@@ -348,6 +357,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 warehouse=self.warehouse,
                 role=self.role,
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         elif self.authentication_type == "OAUTH_AUTHENTICATOR":
@@ -363,6 +373,7 @@ class SnowflakeConnectionConfig(ConfigModel):
                 role=self.role,
                 authenticator=_VALID_AUTH_TYPES.get(self.authentication_type),
                 application=_APPLICATION_NAME,
+                host=f"{self.account_id}.{self.snowflake_domain}",
                 **connect_args,
             )
         else:
@@ -408,7 +419,7 @@ class SnowflakeConnection(Closeable):
         # We often run multiple queries in parallel across multiple threads,
         # so we need to number them to help with log readability.
         query_num = self.get_query_no()
-        logger.info(f"Query #{query_num}: {query}", stacklevel=2)
+        logger.info(f"Query #{query_num}: {query.rstrip()}", stacklevel=2)
         resp = self._connection.cursor(DictCursor).execute(query)
         if resp is not None and resp.rowcount is not None:
             logger.info(
datahub/ingestion/source/snowflake/snowflake_query.py
@@ -266,6 +266,33 @@ SHOW VIEWS IN DATABASE "{db_name}"
 LIMIT {limit} {from_clause};
 """

+    @staticmethod
+    def get_views_for_database(db_name: str) -> str:
+        # We've seen some issues with the `SHOW VIEWS` query,
+        # particularly when it requires pagination.
+        # This is an experimental alternative query that might be more reliable.
+        return f"""\
+SELECT
+    TABLE_CATALOG as "VIEW_CATALOG",
+    TABLE_SCHEMA as "VIEW_SCHEMA",
+    TABLE_NAME as "VIEW_NAME",
+    COMMENT,
+    VIEW_DEFINITION,
+    CREATED,
+    LAST_ALTERED,
+    IS_SECURE
+FROM "{db_name}".information_schema.views
+WHERE TABLE_CATALOG = '{db_name}'
+    AND TABLE_SCHEMA != 'INFORMATION_SCHEMA'
+"""
+
+    @staticmethod
+    def get_views_for_schema(db_name: str, schema_name: str) -> str:
+        return f"""\
+{SnowflakeQuery.get_views_for_database(db_name).rstrip()}
+    AND TABLE_SCHEMA = '{schema_name}'
+"""
+
     @staticmethod
     def get_secure_view_definitions() -> str:
         # https://docs.snowflake.com/en/sql-reference/account-usage/views
datahub/ingestion/source/snowflake/snowflake_report.py
@@ -128,6 +128,7 @@ class SnowflakeV2Report(
     # "Information schema query returned too much data. Please repeat query with more selective predicates.""
     # This will result in overall increase in time complexity
     num_get_tables_for_schema_queries: int = 0
+    num_get_views_for_schema_queries: int = 0

     # these will be non-zero if the user choses to enable the extract_tags = "with_lineage" option, which requires
     # individual queries per object (database, schema, table) and an extra query per table to get the tags on the columns.