acryl-datahub 1.2.0.11rc1__py3-none-any.whl → 1.2.0.11rc3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/METADATA +2557 -2557
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/RECORD +39 -37
- datahub/_version.py +1 -1
- datahub/cli/docker_cli.py +1 -1
- datahub/configuration/common.py +11 -0
- datahub/configuration/kafka.py +19 -1
- datahub/configuration/validate_field_removal.py +3 -0
- datahub/ingestion/autogenerated/capability_summary.json +2 -2
- datahub/ingestion/graph/client.py +7 -7
- datahub/ingestion/graph/filters.py +30 -11
- datahub/ingestion/source/aws/s3_boto_utils.py +4 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +39 -2
- datahub/ingestion/source/looker/looker_common.py +6 -0
- datahub/ingestion/source/looker/looker_constant.py +4 -0
- datahub/ingestion/source/looker/looker_lib_wrapper.py +36 -3
- datahub/ingestion/source/looker/looker_view_id_cache.py +1 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -1
- datahub/ingestion/source/looker/lookml_config.py +30 -2
- datahub/ingestion/source/looker/lookml_refinement.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +42 -29
- datahub/ingestion/source/looker/view_upstream.py +494 -1
- datahub/ingestion/source/s3/source.py +125 -164
- datahub/ingestion/source/snaplogic/snaplogic.py +4 -4
- datahub/ingestion/source/snaplogic/snaplogic_config.py +4 -4
- datahub/ingestion/source/snowflake/snowflake_utils.py +9 -9
- datahub/metadata/_internal_schema_classes.py +1 -1
- datahub/metadata/schema.avsc +1 -1
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
- datahub/sdk/search_filters.py +122 -1
- datahub/secret/datahub_secret_store.py +3 -0
- datahub/secret/environment_secret_store.py +29 -0
- datahub/secret/file_secret_store.py +49 -0
- datahub/specific/aspect_helpers/structured_properties.py +27 -0
- datahub/sql_parsing/sqlglot_lineage.py +6 -1
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.11rc1.dist-info → acryl_datahub-1.2.0.11rc3.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/s3/source.py
CHANGED

@@ -3,14 +3,14 @@ import functools
 import logging
 import os
 import pathlib
+import posixpath
 import re
 import time
 from datetime import datetime
 from pathlib import PurePath
-from typing import
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import smart_open.compression as so_compression
-from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
@@ -36,9 +36,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import (
     get_s3_tags,
-    list_folders,
     list_folders_path,
-    list_objects_recursive,
     list_objects_recursive_path,
 )
 from datahub.ingestion.source.aws.s3_util import (
@@ -83,9 +81,6 @@ from datahub.metadata.schema_classes import (
 from datahub.telemetry import stats, telemetry
 from datahub.utilities.perf_timer import PerfTimer
 
-if TYPE_CHECKING:
-    from mypy_boto3_s3.service_resource import Bucket
-
 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
@@ -872,55 +867,62 @@ class S3Source(StatefulIngestionSourceBase):
 
     def get_dir_to_process(
         self,
-
-        folder: str,
+        uri: str,
         path_spec: PathSpec,
-        protocol: str,
         min: bool = False,
     ) -> List[str]:
-        #
-        #
-
-
-
-
+        # Add any remaining parts of the path_spec before globs, excluding the
+        # final filename component, to the URI and prefix so that we don't
+        # unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
+            "*"
+        )[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
+        # Check if we're at the end of the include path. If so, no need to list sub-folders.
+        if path_spec.has_correct_number_of_directory_components(uri):
+            return [uri]
+
+        logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
+        iterator = list_folders_path(
+            s3_uri=uri,
+            startswith=prefix,
             aws_config=self.source_config.aws_config,
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        folders
-
-
-
-
-        else:
-            return [f"{protocol}{bucket_name}/{folder}"]
-        return [f"{protocol}{bucket_name}/{folder}"]
+        sorted_dirs = sorted(
+            iterator,
+            key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
+                dir.name
+            ),
+            reverse=not min,
+        )
+        folders = []
+        for dir in sorted_dirs:
+            if path_spec.dir_allowed(dir.path):
+                folders_list = self.get_dir_to_process(
+                    uri=dir.path,
+                    path_spec=path_spec,
+                    min=min,
+                )
+                folders.extend(folders_list)
+                if path_spec.traversal_method != FolderTraversalMethod.ALL:
+                    return folders
+        if folders:
+            return folders
+        else:
+            return [uri]
 
     def get_folder_info(
         self,
         path_spec: PathSpec,
-
-        prefix: str,
+        uri: str,
     ) -> Iterable[Folder]:
         """
-        Retrieves all the folders in a path by listing all the files
-
+        Retrieves all the folders in a path by recursively listing all the files under the
+        given URI.
 
         A folder has creation and modification times, size, and a sample file path.
         - Creation time is the earliest creation time of all files in the folder.
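The new get_dir_to_process no longer takes a bucket/folder/protocol triple; it walks fully-qualified URIs and derives a listing prefix from whatever part of the path spec's include pattern has not been consumed yet. The standalone sketch below mirrors that prefix derivation; the simplified remaining_glob_include helper is an assumption standing in for PathSpec.get_remaining_glob_include, whose real behavior may differ.

import posixpath
from fnmatch import fnmatch


def remaining_glob_include(include: str, uri: str) -> str:
    # Simplified stand-in for PathSpec.get_remaining_glob_include: return the
    # components of the include pattern that the walked URI has not covered yet.
    inc_parts = include.rstrip("/").split("/")
    uri_parts = uri.rstrip("/").split("/")
    matched = 0
    for uri_part, inc_part in zip(uri_parts, inc_parts):
        # Wildcards and {table}-style templates match any concrete component.
        pattern = "*" if inc_part.startswith("{") else inc_part
        if not fnmatch(uri_part, pattern):
            break
        matched += 1
    return "/".join(inc_parts[matched:])


def listing_uri_and_prefix(uri: str, include: str):
    # Mirror of the diff's derivation: append any literal directories that sit
    # before the first glob to the URI, and keep the last literal fragment as a
    # startswith prefix for the folder listing.
    if not uri.endswith("/"):
        uri += "/"
    remaining = posixpath.dirname(remaining_glob_include(include, uri)).split("*")[0]
    uri += posixpath.dirname(remaining)
    prefix = posixpath.basename(remaining)
    return uri, prefix


if __name__ == "__main__":
    include = "s3://bucket/data/{table}/year=*/month=*/*.parquet"
    # After the {table} level has been resolved to a concrete folder:
    print(listing_uri_and_prefix("s3://bucket/data/events", include))
    # -> ('s3://bucket/data/events/', 'year=')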
@@ -930,8 +932,7 @@ class S3Source(StatefulIngestionSourceBase):
 
         Parameters:
             path_spec (PathSpec): The path specification used to determine partitioning.
-
-            prefix (str): The prefix path in the S3 bucket to list objects from.
+            uri (str): The path in the S3 bucket to list objects from.
 
         Returns:
             List[Folder]: A list of Folder objects representing the partitions found.
@@ -947,12 +948,22 @@
             self.report.report_file_dropped(s3_uri)
             return allowed
 
+        # Add any remaining parts of the path_spec before globs to the URI and prefix,
+        # so that we don't unnecessarily list too many objects.
+        if not uri.endswith("/"):
+            uri += "/"
+        remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
+        uri += posixpath.dirname(remaining)
+        prefix = posixpath.basename(remaining)
+
         # Process objects in a memory-efficient streaming fashion
         # Instead of loading all objects into memory, we'll accumulate folder data incrementally
         folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
 
-
-
+        logger.info(f"Listing objects under {repr(uri)} with {prefix=}")
+
+        for obj in list_objects_recursive_path(
+            uri, startswith=prefix, aws_config=self.source_config.aws_config
         ):
             s3_path = self.create_s3_path(obj.bucket_name, obj.key)
 
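get_folder_info now streams objects from list_objects_recursive_path and folds them into per-directory FolderInfo records instead of materializing the full listing. A rough standalone sketch of that incremental grouping follows; the FolderInfo fields shown here and the use of each object's last-modified time for both time bounds are assumptions for illustration, not the exact fields of the source's FolderInfo.

import posixpath
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Iterable, Tuple


@dataclass
class FolderInfo:
    # Illustrative accumulator; the real FolderInfo in the S3 source may differ.
    creation_time: datetime
    modification_time: datetime
    size: int
    sample_file: str


def accumulate_folders(
    objects: Iterable[Tuple[str, datetime, int]],
) -> Dict[str, FolderInfo]:
    # Group (path, last_modified, size) records by parent directory, keeping the
    # earliest time as creation time, the latest as modification time, a running
    # size total, and one sample file per folder.
    folders: Dict[str, FolderInfo] = {}
    for path, modified, size in objects:
        dirname = posixpath.dirname(path)
        info = folders.get(dirname)
        if info is None:
            folders[dirname] = FolderInfo(modified, modified, size, path)
        else:
            info.creation_time = min(info.creation_time, modified)
            info.modification_time = max(info.modification_time, modified)
            info.size += size
    return folders


if __name__ == "__main__":
    objs = [
        ("s3://bucket/t/year=2024/a.parquet", datetime(2024, 1, 2), 100),
        ("s3://bucket/t/year=2024/b.parquet", datetime(2024, 1, 5), 250),
        ("s3://bucket/t/year=2025/c.parquet", datetime(2025, 1, 1), 80),
    ]
    for name, info in accumulate_folders(objs).items():
        print(name, info.size, info.modification_time.date())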
@@ -1047,7 +1058,7 @@
         # This creates individual file-level datasets
         yield from self._process_simple_path(path_spec)
 
-    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
+    def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         """
         Process S3 paths containing {table} templates to create table-level datasets.
 
@@ -1133,20 +1144,12 @@
 
             # STEP 4: Process each table folder to create a table-level dataset
             for folder in table_folders:
-
-                table_folder = get_bucket_relative_path(folder.path)
-                bucket = s3.Bucket(bucket_name)
-
-                # Create the full S3 path for this table
-                table_s3_path = self.create_s3_path(bucket_name, table_folder)
-                logger.info(
-                    f"Processing table folder: {table_folder} -> {table_s3_path}"
-                )
+                logger.info(f"Processing table path: {folder.path}")
 
                 # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
                 # This uses the compiled regex pattern to extract the table name from the full path
                 table_name, _ = self.extract_table_name_and_path(
-                    path_spec,
+                    path_spec, folder.path
                 )
 
                 # Apply table name filtering if configured
@@ -1155,121 +1158,79 @@
                     continue
 
                 # STEP 5: Handle partition traversal based on configuration
-
-                all_partition_folders = list(
-                    list_folders(
-                        bucket_name, table_folder, self.source_config.aws_config
-                    )
-                )
-                logger.info(
-                    f"Found {len(all_partition_folders)} partition folders under table {table_name} using method {path_spec.traversal_method}"
-                )
+                dirs_to_process = []
 
-                if
-                    #
-                    dirs_to_process = []
+                if path_spec.traversal_method == FolderTraversalMethod.ALL:
+                    # Process ALL partitions (original behavior)
+                    dirs_to_process = [folder.path]
+                    logger.debug(
+                        f"Processing ALL partition folders under: {folder.path}"
+                    )
 
-
-
-
-
-
+                else:
+                    # Use the original get_dir_to_process logic for MIN/MAX
+                    if (
+                        path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
+                        or path_spec.traversal_method == FolderTraversalMethod.MAX
+                    ):
+                        # Get MAX partition using original logic
+                        dirs_to_process_max = self.get_dir_to_process(
+                            uri=folder.path,
+                            path_spec=path_spec,
+                            min=False,
                         )
-
-
-
-
-
-                    if (
-                        path_spec.traversal_method
-                        == FolderTraversalMethod.MIN_MAX
-                        or path_spec.traversal_method
-                        == FolderTraversalMethod.MAX
-                    ):
-                        # Get MAX partition using original logic
-                        dirs_to_process_max = self.get_dir_to_process(
-                            bucket_name=bucket_name,
-                            folder=table_folder + "/",
-                            path_spec=path_spec,
-                            protocol=protocol,
-                            min=False,
-                        )
-                        if dirs_to_process_max:
-                            # Convert full S3 paths back to relative paths for processing
-                            dirs_to_process.extend(
-                                [
-                                    d.replace(f"{protocol}{bucket_name}/", "")
-                                    for d in dirs_to_process_max
-                                ]
-                            )
-                            logger.debug(
-                                f"Added MAX partition: {dirs_to_process_max}"
-                            )
-
-                        if (
-                            path_spec.traversal_method
-                            == FolderTraversalMethod.MIN_MAX
-                        ):
-                            # Get MIN partition using original logic
-                            dirs_to_process_min = self.get_dir_to_process(
-                                bucket_name=bucket_name,
-                                folder=table_folder + "/",
-                                path_spec=path_spec,
-                                protocol=protocol,
-                                min=True,
+                        if dirs_to_process_max:
+                            dirs_to_process.extend(dirs_to_process_max)
+                            logger.debug(
+                                f"Added MAX partition: {dirs_to_process_max}"
                             )
-                        if dirs_to_process_min:
-                            # Convert full S3 paths back to relative paths for processing
-                            dirs_to_process.extend(
-                                [
-                                    d.replace(f"{protocol}{bucket_name}/", "")
-                                    for d in dirs_to_process_min
-                                ]
-                            )
-                            logger.debug(
-                                f"Added MIN partition: {dirs_to_process_min}"
-                            )
-
-                # Process the selected partitions
-                all_folders = []
-                for partition_folder in dirs_to_process:
-                    # Ensure we have a clean folder path
-                    clean_folder = partition_folder.rstrip("/")
-
-                    logger.info(f"Scanning files in partition: {clean_folder}")
-                    partition_files = list(
-                        self.get_folder_info(path_spec, bucket, clean_folder)
-                    )
-                    all_folders.extend(partition_files)
 
-                if
-                    #
-
-
+                    if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
+                        # Get MIN partition using original logic
+                        dirs_to_process_min = self.get_dir_to_process(
+                            uri=folder.path,
+                            path_spec=path_spec,
+                            min=True,
                         )
+                        if dirs_to_process_min:
+                            dirs_to_process.extend(dirs_to_process_min)
+                            logger.debug(
+                                f"Added MIN partition: {dirs_to_process_min}"
+                            )
 
-
-
+                # Process the selected partitions
+                all_folders = []
+                for partition_path in dirs_to_process:
+                    logger.info(f"Scanning files in partition: {partition_path}")
+                    partition_files = list(
+                        self.get_folder_info(path_spec, partition_path)
+                    )
+                    all_folders.extend(partition_files)
 
-
-
+                if all_folders:
+                    # Use the most recent file across all processed partitions
+                    latest_file = max(
+                        all_folders, key=lambda x: x.modification_time
+                    )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    # Get partition information
+                    partitions = [f for f in all_folders if f.is_partition]
+
+                    # Calculate total size of processed partitions
+                    total_size = sum(f.size for f in all_folders)
+
+                    # Create ONE BrowsePath per table
+                    # The key insight: we need to provide the sample file for schema inference
+                    # but the table path should be extracted correctly by extract_table_name_and_path
+                    yield BrowsePath(
+                        file=latest_file.sample_file,  # Sample file for schema inference
+                        timestamp=latest_file.modification_time,  # Latest timestamp
+                        size=total_size,  # Size of processed partitions
+                        partitions=partitions,  # Partition metadata
+                    )
                 else:
                     logger.warning(
-                        f"No
+                        f"No files found in processed partitions for table {table_name}"
                     )
 
             except Exception as e:
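The rewritten traversal keeps the same three strategies but selects partitions by URI rather than by bucket-relative folder. A toy sketch of that selection logic, independent of the DataHub classes, is below; FolderTraversalMethod here is a local stand-in mirroring the enum referenced in the diff, and partition_key is a deliberately simplified ordering rather than the source's partitioned_folder_comparator.

from enum import Enum
from typing import List


class FolderTraversalMethod(Enum):
    # Local stand-in mirroring the traversal options referenced in the diff.
    ALL = "ALL"
    MIN_MAX = "MIN_MAX"
    MAX = "MAX"


def partition_key(name: str):
    # Simplified ordering: compare 'year=2024'-style names numerically when the
    # value is numeric, lexicographically otherwise.
    value = name.rsplit("=", 1)[-1]
    return (0, int(value), "") if value.isdigit() else (1, 0, value)


def select_partitions(folders: List[str], method: FolderTraversalMethod) -> List[str]:
    # Everything for ALL; otherwise only the newest (MAX) partition, plus the
    # oldest one when MIN_MAX is requested.
    if method == FolderTraversalMethod.ALL or not folders:
        return list(folders)
    ordered = sorted(folders, key=partition_key)
    selected = [ordered[-1]]  # MAX partition
    if method == FolderTraversalMethod.MIN_MAX:
        selected.append(ordered[0])  # MIN partition
    return selected


if __name__ == "__main__":
    parts = ["year=2023", "year=2024", "year=2025"]
    print(select_partitions(parts, FolderTraversalMethod.MIN_MAX))
    # -> ['year=2025', 'year=2023']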
datahub/ingestion/source/snaplogic/snaplogic.py
CHANGED

@@ -56,12 +56,12 @@ from datahub.metadata.schema_classes import (
 )
 
 
-@platform_name("
+@platform_name("SnapLogic")
 @config_class(SnaplogicConfig)
 @support_status(SupportStatus.TESTING)
 @capability(
     SourceCapability.PLATFORM_INSTANCE,
-    "
+    "SnapLogic does not support platform instances",
     supported=False,
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
@@ -69,7 +69,7 @@ from datahub.metadata.schema_classes import (
 @capability(SourceCapability.DELETION_DETECTION, "Not supported yet", supported=False)
 class SnaplogicSource(StatefulIngestionSourceBase):
     """
-    A source plugin for ingesting lineage and metadata from
+    A source plugin for ingesting lineage and metadata from SnapLogic.
     """
 
     def __init__(self, config: SnaplogicConfig, ctx: PipelineContext):
@@ -99,7 +99,7 @@ class SnaplogicSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         try:
             self.report.info(
-                message="Starting lineage ingestion from
+                message="Starting lineage ingestion from SnapLogic",
                 title="Lineage Ingestion",
             )
 
datahub/ingestion/source/snaplogic/snaplogic_config.py
CHANGED

@@ -15,14 +15,14 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 class SnaplogicConfig(
     StatefulIngestionConfigBase, StatefulLineageConfigMixin, StatefulUsageConfigMixin
 ):
-    platform: str = "
+    platform: str = "SnapLogic"
     username: str = Field(description="Username")
     password: SecretStr = Field(description="Password")
     base_url: str = Field(
         default="https://elastic.snaplogic.com",
-        description="Url to your
+        description="Url to your SnapLogic instance: `https://elastic.snaplogic.com`, or similar. Used for making API calls to SnapLogic.",
     )
-    org_name: str = Field(description="Organization name from
+    org_name: str = Field(description="Organization name from SnapLogic instance")
     namespace_mapping: dict = Field(
         default={}, description="Mapping of namespaces to platform instances"
     )
@@ -32,6 +32,6 @@ class SnaplogicConfig(
     )
     create_non_snaplogic_datasets: bool = Field(
         default=False,
-        description="Whether to create datasets for non-
+        description="Whether to create datasets for non-SnapLogic datasets (e.g., databases, S3, etc.)",
     )
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
datahub/ingestion/source/snowflake/snowflake_utils.py
CHANGED

@@ -73,16 +73,16 @@ class SnowsightUrlBuilder:
             url_cloud_provider_suffix = ""
         else:
             url_cloud_provider_suffix = f".{cloud}"
-
-
+        # Note: Snowsight is always accessed via the public internet (app.snowflake.com)
+        # even for accounts using privatelink. Privatelink only applies to database connections,
+        # not the Snowsight web UI.
+        # Standard Snowsight URL format - works for most regions
+        # China region may use app.snowflake.cn instead of app.snowflake.com. This is not documented, just
+        # guessing Based on existence of snowflake.cn domain (https://domainindex.com/domains/snowflake.cn)
+        if snowflake_domain == "snowflakecomputing.cn":
+            url = f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
         else:
-
-        # China region may use app.snowflake.cn instead of app.snowflake.com. This is not documented, just
-        # guessing Based on existence of snowflake.cn domain (https://domainindex.com/domains/snowflake.cn)
-        if snowflake_domain == "snowflakecomputing.cn":
-            url = f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
-        else:
-            url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
+            url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
         return url
 
     @staticmethod
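The consolidated Snowsight URL logic can be summarized in a small standalone function. The signature and the rule for deriving url_cloud_provider_suffix (empty when no cloud suffix is needed) are assumptions for illustration; only the .cn-versus-.com branch and the URL shape come from the diff.

def snowsight_base_url(
    account_locator: str,
    cloud_region_id: str,
    cloud: str = "",
    snowflake_domain: str = "snowflakecomputing.com",
) -> str:
    # Illustrative sketch: accounts on snowflakecomputing.cn get the app.snowflake.cn
    # front end, everything else goes to app.snowflake.com.
    url_cloud_provider_suffix = f".{cloud}" if cloud else ""
    app_domain = (
        "app.snowflake.cn"
        if snowflake_domain == "snowflakecomputing.cn"
        else "app.snowflake.com"
    )
    return f"https://{app_domain}/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"


if __name__ == "__main__":
    print(snowsight_base_url("xy12345", "us-east-1", cloud="aws"))
    print(snowsight_base_url("ab67890", "cn-north-1", snowflake_domain="snowflakecomputing.cn"))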
datahub/metadata/_internal_schema_classes.py
CHANGED

@@ -14040,7 +14040,7 @@ class CorpUserEditableInfoClass(_Aspect):
         else:
             self.skills = skills
         if pictureLink is None:
-            # default: '
+            # default: 'assets/platforms/default_avatar.png'
             self.pictureLink = self.RECORD_SCHEMA.fields_dict["pictureLink"].default
         else:
             self.pictureLink = pictureLink
datahub/metadata/schema.avsc
CHANGED
@@ -6174,7 +6174,7 @@
 },
 "type": "string",
 "name": "pictureLink",
-"default": "
+"default": "assets/platforms/default_avatar.png",
 "doc": "A URL which points to a picture which user wants to set as a profile photo"
 },
 {
datahub/metadata/schemas/CorpUserEditableInfo.avsc
CHANGED

@@ -53,7 +53,7 @@
 },
 "type": "string",
 "name": "pictureLink",
-"default": "
+"default": "assets/platforms/default_avatar.png",
 "doc": "A URL which points to a picture which user wants to set as a profile photo"
 },
 {
datahub/metadata/schemas/MetadataChangeEvent.avsc
CHANGED

@@ -1749,7 +1749,7 @@
 },
 "type": "string",
 "name": "pictureLink",
-"default": "
+"default": "assets/platforms/default_avatar.png",
 "doc": "A URL which points to a picture which user wants to set as a profile photo"
 },
 {