acryl-datahub 1.2.0.11rc2__py3-none-any.whl → 1.2.0.11rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (26)
  1. {acryl_datahub-1.2.0.11rc2.dist-info → acryl_datahub-1.2.0.11rc4.dist-info}/METADATA +2609 -2609
  2. {acryl_datahub-1.2.0.11rc2.dist-info → acryl_datahub-1.2.0.11rc4.dist-info}/RECORD +26 -26
  3. datahub/_version.py +1 -1
  4. datahub/cli/docker_cli.py +1 -1
  5. datahub/configuration/common.py +11 -0
  6. datahub/configuration/kafka.py +19 -1
  7. datahub/ingestion/autogenerated/capability_summary.json +2 -2
  8. datahub/ingestion/graph/client.py +7 -7
  9. datahub/ingestion/graph/filters.py +30 -11
  10. datahub/ingestion/source/aws/s3_boto_utils.py +4 -1
  11. datahub/ingestion/source/data_lake_common/path_spec.py +39 -2
  12. datahub/ingestion/source/s3/source.py +125 -164
  13. datahub/ingestion/source/snaplogic/snaplogic.py +4 -4
  14. datahub/ingestion/source/snaplogic/snaplogic_config.py +4 -4
  15. datahub/ingestion/source/snowflake/snowflake_queries.py +23 -7
  16. datahub/ingestion/source/snowflake/snowflake_utils.py +9 -9
  17. datahub/metadata/_internal_schema_classes.py +1 -1
  18. datahub/metadata/schema.avsc +1 -1
  19. datahub/metadata/schemas/CorpUserEditableInfo.avsc +1 -1
  20. datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
  21. datahub/specific/aspect_helpers/structured_properties.py +27 -0
  22. datahub/sql_parsing/sqlglot_lineage.py +6 -1
  23. {acryl_datahub-1.2.0.11rc2.dist-info → acryl_datahub-1.2.0.11rc4.dist-info}/WHEEL +0 -0
  24. {acryl_datahub-1.2.0.11rc2.dist-info → acryl_datahub-1.2.0.11rc4.dist-info}/entry_points.txt +0 -0
  25. {acryl_datahub-1.2.0.11rc2.dist-info → acryl_datahub-1.2.0.11rc4.dist-info}/licenses/LICENSE +0 -0
  26. {acryl_datahub-1.2.0.11rc2.dist-info → acryl_datahub-1.2.0.11rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/s3/source.py
@@ -3,14 +3,14 @@ import functools
  import logging
  import os
  import pathlib
+ import posixpath
  import re
  import time
  from datetime import datetime
  from pathlib import PurePath
- from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
+ from typing import Any, Dict, Iterable, List, Optional, Tuple

  import smart_open.compression as so_compression
- from more_itertools import peekable
  from pyspark.conf import SparkConf
  from pyspark.sql import SparkSession
  from pyspark.sql.dataframe import DataFrame
@@ -36,9 +36,7 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor
  from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws.s3_boto_utils import (
      get_s3_tags,
-     list_folders,
      list_folders_path,
-     list_objects_recursive,
      list_objects_recursive_path,
  )
  from datahub.ingestion.source.aws.s3_util import (
@@ -83,9 +81,6 @@ from datahub.metadata.schema_classes import (
  from datahub.telemetry import stats, telemetry
  from datahub.utilities.perf_timer import PerfTimer

- if TYPE_CHECKING:
-     from mypy_boto3_s3.service_resource import Bucket
-
  # hide annoying debug errors from py4j
  logging.getLogger("py4j").setLevel(logging.ERROR)
  logger: logging.Logger = logging.getLogger(__name__)
@@ -872,55 +867,62 @@ class S3Source(StatefulIngestionSourceBase):

      def get_dir_to_process(
          self,
-         bucket_name: str,
-         folder: str,
+         uri: str,
          path_spec: PathSpec,
-         protocol: str,
          min: bool = False,
      ) -> List[str]:
-         # if len(path_spec.include.split("/")) == len(f"{protocol}{bucket_name}/{folder}".split("/")):
-         #     return [f"{protocol}{bucket_name}/{folder}"]
-
-         iterator = list_folders(
-             bucket_name=bucket_name,
-             prefix=folder,
+         # Add any remaining parts of the path_spec before globs, excluding the
+         # final filename component, to the URI and prefix so that we don't
+         # unnecessarily list too many objects.
+         if not uri.endswith("/"):
+             uri += "/"
+         remaining = posixpath.dirname(path_spec.get_remaining_glob_include(uri)).split(
+             "*"
+         )[0]
+         uri += posixpath.dirname(remaining)
+         prefix = posixpath.basename(remaining)
+
+         # Check if we're at the end of the include path. If so, no need to list sub-folders.
+         if path_spec.has_correct_number_of_directory_components(uri):
+             return [uri]
+
+         logger.debug(f"get_dir_to_process listing folders {uri=} {prefix=}")
+         iterator = list_folders_path(
+             s3_uri=uri,
+             startswith=prefix,
              aws_config=self.source_config.aws_config,
          )
-         iterator = peekable(iterator)
-         if iterator:
-             sorted_dirs = sorted(
-                 iterator,
-                 key=functools.cmp_to_key(partitioned_folder_comparator),
-                 reverse=not min,
-             )
-             folders = []
-             for dir in sorted_dirs:
-                 if path_spec.dir_allowed(f"{protocol}{bucket_name}/{dir}/"):
-                     folders_list = self.get_dir_to_process(
-                         bucket_name=bucket_name,
-                         folder=dir + "/",
-                         path_spec=path_spec,
-                         protocol=protocol,
-                         min=min,
-                     )
-                     folders.extend(folders_list)
-                     if path_spec.traversal_method != FolderTraversalMethod.ALL:
-                         return folders
-             if folders:
-                 return folders
-             else:
-                 return [f"{protocol}{bucket_name}/{folder}"]
-         return [f"{protocol}{bucket_name}/{folder}"]
+         sorted_dirs = sorted(
+             iterator,
+             key=lambda dir: functools.cmp_to_key(partitioned_folder_comparator)(
+                 dir.name
+             ),
+             reverse=not min,
+         )
+         folders = []
+         for dir in sorted_dirs:
+             if path_spec.dir_allowed(dir.path):
+                 folders_list = self.get_dir_to_process(
+                     uri=dir.path,
+                     path_spec=path_spec,
+                     min=min,
+                 )
+                 folders.extend(folders_list)
+                 if path_spec.traversal_method != FolderTraversalMethod.ALL:
+                     return folders
+         if folders:
+             return folders
+         else:
+             return [uri]

      def get_folder_info(
          self,
          path_spec: PathSpec,
-         bucket: "Bucket",
-         prefix: str,
+         uri: str,
      ) -> Iterable[Folder]:
          """
-         Retrieves all the folders in a path by listing all the files in the prefix.
-         If the prefix is a full path then only that folder will be extracted.
+         Retrieves all the folders in a path by recursively listing all the files under the
+         given URI.

          A folder has creation and modification times, size, and a sample file path.
          - Creation time is the earliest creation time of all files in the folder.
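The refactored get_dir_to_process above no longer walks folder by folder from the bucket root; it first narrows the listing by splicing the literal (pre-glob) part of the path_spec include onto the URI. A minimal standalone sketch of that trimming step, with a hypothetical URI and include pattern (get_remaining_glob_include is the new PathSpec helper added in this release; its result is simply hard-coded here):

    import posixpath

    # Hypothetical inputs, for illustration only.
    uri = "s3://my-bucket/data/events/"
    # Assume path_spec.get_remaining_glob_include(uri) returns the part of the
    # include pattern that the URI has not matched yet:
    remaining_glob = "year=*/month=*/*.parquet"

    # Mirror the trimming in get_dir_to_process: drop the filename component,
    # then keep only the text before the first glob character.
    remaining = posixpath.dirname(remaining_glob).split("*")[0]  # "year="
    uri += posixpath.dirname(remaining)     # no literal directory left to append here
    prefix = posixpath.basename(remaining)  # "year="

    # list_folders_path(s3_uri=uri, startswith=prefix, ...) then only lists
    # folders under s3://my-bucket/data/events/ whose names start with "year=".
    print(uri, prefix)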
@@ -930,8 +932,7 @@

          Parameters:
              path_spec (PathSpec): The path specification used to determine partitioning.
-             bucket (Bucket): The S3 bucket object.
-             prefix (str): The prefix path in the S3 bucket to list objects from.
+             uri (str): The path in the S3 bucket to list objects from.

          Returns:
              List[Folder]: A list of Folder objects representing the partitions found.
@@ -947,12 +948,22 @@
                  self.report.report_file_dropped(s3_uri)
              return allowed

+         # Add any remaining parts of the path_spec before globs to the URI and prefix,
+         # so that we don't unnecessarily list too many objects.
+         if not uri.endswith("/"):
+             uri += "/"
+         remaining = path_spec.get_remaining_glob_include(uri).split("*")[0]
+         uri += posixpath.dirname(remaining)
+         prefix = posixpath.basename(remaining)
+
          # Process objects in a memory-efficient streaming fashion
          # Instead of loading all objects into memory, we'll accumulate folder data incrementally
          folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo

-         for obj in list_objects_recursive(
-             bucket.name, prefix, self.source_config.aws_config
+         logger.info(f"Listing objects under {repr(uri)} with {prefix=}")
+
+         for obj in list_objects_recursive_path(
+             uri, startswith=prefix, aws_config=self.source_config.aws_config
          ):
              s3_path = self.create_s3_path(obj.bucket_name, obj.key)

@@ -1047,7 +1058,7 @@
              # This creates individual file-level datasets
              yield from self._process_simple_path(path_spec)

-     def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:  # noqa: C901
+     def _process_templated_path(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
          """
          Process S3 paths containing {table} templates to create table-level datasets.

@@ -1133,20 +1144,12 @@

              # STEP 4: Process each table folder to create a table-level dataset
              for folder in table_folders:
-                 bucket_name = get_bucket_name(folder.path)
-                 table_folder = get_bucket_relative_path(folder.path)
-                 bucket = s3.Bucket(bucket_name)
-
-                 # Create the full S3 path for this table
-                 table_s3_path = self.create_s3_path(bucket_name, table_folder)
-                 logger.info(
-                     f"Processing table folder: {table_folder} -> {table_s3_path}"
-                 )
+                 logger.info(f"Processing table path: {folder.path}")

                  # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
                  # This uses the compiled regex pattern to extract the table name from the full path
                  table_name, _ = self.extract_table_name_and_path(
-                     path_spec, table_s3_path
+                     path_spec, folder.path
                  )

                  # Apply table name filtering if configured
@@ -1155,121 +1158,79 @@
                      continue

                  # STEP 5: Handle partition traversal based on configuration
-                 # Get all partition folders first
-                 all_partition_folders = list(
-                     list_folders(
-                         bucket_name, table_folder, self.source_config.aws_config
-                     )
-                 )
-                 logger.info(
-                     f"Found {len(all_partition_folders)} partition folders under table {table_name} using method {path_spec.traversal_method}"
-                 )
+                 dirs_to_process = []

-                 if all_partition_folders:
-                     # Apply the same traversal logic as the original code
-                     dirs_to_process = []
+                 if path_spec.traversal_method == FolderTraversalMethod.ALL:
+                     # Process ALL partitions (original behavior)
+                     dirs_to_process = [folder.path]
+                     logger.debug(
+                         f"Processing ALL partition folders under: {folder.path}"
+                     )

-                     if path_spec.traversal_method == FolderTraversalMethod.ALL:
-                         # Process ALL partitions (original behavior)
-                         dirs_to_process = all_partition_folders
-                         logger.debug(
-                             f"Processing ALL {len(all_partition_folders)} partitions"
+                 else:
+                     # Use the original get_dir_to_process logic for MIN/MAX
+                     if (
+                         path_spec.traversal_method == FolderTraversalMethod.MIN_MAX
+                         or path_spec.traversal_method == FolderTraversalMethod.MAX
+                     ):
+                         # Get MAX partition using original logic
+                         dirs_to_process_max = self.get_dir_to_process(
+                             uri=folder.path,
+                             path_spec=path_spec,
+                             min=False,
                          )
-
-                     else:
-                         # Use the original get_dir_to_process logic for MIN/MAX
-                         protocol = "s3://"  # Default protocol for S3
-
-                         if (
-                             path_spec.traversal_method
-                             == FolderTraversalMethod.MIN_MAX
-                             or path_spec.traversal_method
-                             == FolderTraversalMethod.MAX
-                         ):
-                             # Get MAX partition using original logic
-                             dirs_to_process_max = self.get_dir_to_process(
-                                 bucket_name=bucket_name,
-                                 folder=table_folder + "/",
-                                 path_spec=path_spec,
-                                 protocol=protocol,
-                                 min=False,
-                             )
-                             if dirs_to_process_max:
-                                 # Convert full S3 paths back to relative paths for processing
-                                 dirs_to_process.extend(
-                                     [
-                                         d.replace(f"{protocol}{bucket_name}/", "")
-                                         for d in dirs_to_process_max
-                                     ]
-                                 )
-                                 logger.debug(
-                                     f"Added MAX partition: {dirs_to_process_max}"
-                                 )
-
-                         if (
-                             path_spec.traversal_method
-                             == FolderTraversalMethod.MIN_MAX
-                         ):
-                             # Get MIN partition using original logic
-                             dirs_to_process_min = self.get_dir_to_process(
-                                 bucket_name=bucket_name,
-                                 folder=table_folder + "/",
-                                 path_spec=path_spec,
-                                 protocol=protocol,
-                                 min=True,
+                         if dirs_to_process_max:
+                             dirs_to_process.extend(dirs_to_process_max)
+                             logger.debug(
+                                 f"Added MAX partition: {dirs_to_process_max}"
                              )
-                             if dirs_to_process_min:
-                                 # Convert full S3 paths back to relative paths for processing
-                                 dirs_to_process.extend(
-                                     [
-                                         d.replace(f"{protocol}{bucket_name}/", "")
-                                         for d in dirs_to_process_min
-                                     ]
-                                 )
-                                 logger.debug(
-                                     f"Added MIN partition: {dirs_to_process_min}"
-                                 )
-
-                     # Process the selected partitions
-                     all_folders = []
-                     for partition_folder in dirs_to_process:
-                         # Ensure we have a clean folder path
-                         clean_folder = partition_folder.rstrip("/")
-
-                         logger.info(f"Scanning files in partition: {clean_folder}")
-                         partition_files = list(
-                             self.get_folder_info(path_spec, bucket, clean_folder)
-                         )
-                         all_folders.extend(partition_files)

-                     if all_folders:
-                         # Use the most recent file across all processed partitions
-                         latest_file = max(
-                             all_folders, key=lambda x: x.modification_time
+                     if path_spec.traversal_method == FolderTraversalMethod.MIN_MAX:
+                         # Get MIN partition using original logic
+                         dirs_to_process_min = self.get_dir_to_process(
+                             uri=folder.path,
+                             path_spec=path_spec,
+                             min=True,
                          )
+                         if dirs_to_process_min:
+                             dirs_to_process.extend(dirs_to_process_min)
+                             logger.debug(
+                                 f"Added MIN partition: {dirs_to_process_min}"
+                             )

-                         # Get partition information
-                         partitions = [f for f in all_folders if f.is_partition]
+                 # Process the selected partitions
+                 all_folders = []
+                 for partition_path in dirs_to_process:
+                     logger.info(f"Scanning files in partition: {partition_path}")
+                     partition_files = list(
+                         self.get_folder_info(path_spec, partition_path)
+                     )
+                     all_folders.extend(partition_files)

-                         # Calculate total size of processed partitions
-                         total_size = sum(f.size for f in all_folders)
+                 if all_folders:
+                     # Use the most recent file across all processed partitions
+                     latest_file = max(
+                         all_folders, key=lambda x: x.modification_time
+                     )

-                         # Create ONE BrowsePath per table
-                         # The key insight: we need to provide the sample file for schema inference
-                         # but the table path should be extracted correctly by extract_table_name_and_path
-                         yield BrowsePath(
-                             file=latest_file.sample_file,  # Sample file for schema inference
-                             timestamp=latest_file.modification_time,  # Latest timestamp
-                             size=total_size,  # Size of processed partitions
-                             partitions=partitions,  # Partition metadata
-                         )
-                     else:
-                         logger.warning(
-                             f"No files found in processed partitions for table {table_name}"
-                         )
+                     # Get partition information
+                     partitions = [f for f in all_folders if f.is_partition]
+
+                     # Calculate total size of processed partitions
+                     total_size = sum(f.size for f in all_folders)
+
+                     # Create ONE BrowsePath per table
+                     # The key insight: we need to provide the sample file for schema inference
+                     # but the table path should be extracted correctly by extract_table_name_and_path
+                     yield BrowsePath(
+                         file=latest_file.sample_file,  # Sample file for schema inference
+                         timestamp=latest_file.modification_time,  # Latest timestamp
+                         size=total_size,  # Size of processed partitions
+                         partitions=partitions,  # Partition metadata
+                     )
                  else:
                      logger.warning(
-                         f"No partition folders found under table {table_name}"
+                         f"No files found in processed partitions for table {table_name}"
                      )

              except Exception as e:
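For MIN/MAX traversal, get_dir_to_process sorts the candidate partition folders with functools.cmp_to_key(partitioned_folder_comparator) and keeps the first allowed one. A minimal standalone sketch of that selection pattern, using a simplified stand-in comparator (the real partitioned_folder_comparator in source.py handles more folder-name shapes):

    import functools

    def simple_partition_comparator(a: str, b: str) -> int:
        # Simplified stand-in: compare "key=value" folder names by value,
        # numerically when both values are numeric.
        va, vb = a.split("=")[-1], b.split("=")[-1]
        if va.isdigit() and vb.isdigit():
            return (int(va) > int(vb)) - (int(va) < int(vb))
        return (va > vb) - (va < vb)

    folders = ["month=9", "month=10", "month=11", "month=2"]

    # min=False -> descending sort, so the first element is the MAX partition;
    # min=True  -> ascending sort, so the first element is the MIN partition.
    for want_min in (False, True):
        ordered = sorted(
            folders,
            key=functools.cmp_to_key(simple_partition_comparator),
            reverse=not want_min,
        )
        print("MIN" if want_min else "MAX", "->", ordered[0])
    # prints: MAX -> month=11, then MIN -> month=2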
datahub/ingestion/source/snaplogic/snaplogic.py
@@ -56,12 +56,12 @@ from datahub.metadata.schema_classes import (
  )


- @platform_name("Snaplogic")
+ @platform_name("SnapLogic")
  @config_class(SnaplogicConfig)
  @support_status(SupportStatus.TESTING)
  @capability(
      SourceCapability.PLATFORM_INSTANCE,
-     "Snaplogic does not support platform instances",
+     "SnapLogic does not support platform instances",
      supported=False,
  )
  @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
@@ -69,7 +69,7 @@
  @capability(SourceCapability.DELETION_DETECTION, "Not supported yet", supported=False)
  class SnaplogicSource(StatefulIngestionSourceBase):
      """
-     A source plugin for ingesting lineage and metadata from Snaplogic.
+     A source plugin for ingesting lineage and metadata from SnapLogic.
      """

      def __init__(self, config: SnaplogicConfig, ctx: PipelineContext):
@@ -99,7 +99,7 @@ class SnaplogicSource(StatefulIngestionSourceBase):
      def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
          try:
              self.report.info(
-                 message="Starting lineage ingestion from Snaplogic",
+                 message="Starting lineage ingestion from SnapLogic",
                  title="Lineage Ingestion",
              )

datahub/ingestion/source/snaplogic/snaplogic_config.py
@@ -15,14 +15,14 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
  class SnaplogicConfig(
      StatefulIngestionConfigBase, StatefulLineageConfigMixin, StatefulUsageConfigMixin
  ):
-     platform: str = "Snaplogic"
+     platform: str = "SnapLogic"
      username: str = Field(description="Username")
      password: SecretStr = Field(description="Password")
      base_url: str = Field(
          default="https://elastic.snaplogic.com",
-         description="Url to your Snaplogic instance: `https://elastic.snaplogic.com`, or similar. Used for making API calls to Snaplogic.",
+         description="Url to your SnapLogic instance: `https://elastic.snaplogic.com`, or similar. Used for making API calls to SnapLogic.",
      )
-     org_name: str = Field(description="Organization name from Snaplogic instance")
+     org_name: str = Field(description="Organization name from SnapLogic instance")
      namespace_mapping: dict = Field(
          default={}, description="Mapping of namespaces to platform instances"
      )
@@ -32,6 +32,6 @@
      )
      create_non_snaplogic_datasets: bool = Field(
          default=False,
-         description="Whether to create datasets for non-Snaplogic datasets (e.g., databases, S3, etc.)",
+         description="Whether to create datasets for non-SnapLogic datasets (e.g., databases, S3, etc.)",
      )
      stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
datahub/ingestion/source/snowflake/snowflake_queries.py
@@ -297,15 +297,31 @@ class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable):
          if use_cached_audit_log:
              logger.info(f"Using cached audit log at {audit_log_file}")
          else:
-             logger.info(f"Fetching audit log into {audit_log_file}")
+             # Check if any query-based features are enabled before fetching
+             needs_query_data = any(
+                 [
+                     self.config.include_lineage,
+                     self.config.include_queries,
+                     self.config.include_usage_statistics,
+                     self.config.include_query_usage_statistics,
+                     self.config.include_operations,
+                 ]
+             )
+
+             if not needs_query_data:
+                 logger.info(
+                     "All query-based features are disabled. Skipping expensive query log fetch."
+                 )
+             else:
+                 logger.info(f"Fetching audit log into {audit_log_file}")

-             with self.report.copy_history_fetch_timer:
-                 for copy_entry in self.fetch_copy_history():
-                     queries.append(copy_entry)
+                 with self.report.copy_history_fetch_timer:
+                     for copy_entry in self.fetch_copy_history():
+                         queries.append(copy_entry)

-             with self.report.query_log_fetch_timer:
-                 for entry in self.fetch_query_log(users):
-                     queries.append(entry)
+                 with self.report.query_log_fetch_timer:
+                     for entry in self.fetch_query_log(users):
+                         queries.append(entry)

          stored_proc_tracker: StoredProcLineageTracker = self._exit_stack.enter_context(
              StoredProcLineageTracker(
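A small standalone sketch of the new guard added above; the flag names mirror the config fields referenced in the hunk, but the dataclass is only an illustrative stand-in for the real extractor config:

    from dataclasses import dataclass

    @dataclass
    class QueryFeatureFlags:
        # Illustrative stand-in for the extractor config referenced in the diff.
        include_lineage: bool = False
        include_queries: bool = False
        include_usage_statistics: bool = False
        include_query_usage_statistics: bool = False
        include_operations: bool = False

    def needs_query_data(config: QueryFeatureFlags) -> bool:
        # Mirrors the any([...]) check added in the hunk above: the expensive
        # audit-log fetch only runs when at least one query-based feature is on.
        return any(
            [
                config.include_lineage,
                config.include_queries,
                config.include_usage_statistics,
                config.include_query_usage_statistics,
                config.include_operations,
            ]
        )

    print(needs_query_data(QueryFeatureFlags()))                      # False -> skip fetch
    print(needs_query_data(QueryFeatureFlags(include_lineage=True)))  # True  -> fetch audit log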
datahub/ingestion/source/snowflake/snowflake_utils.py
@@ -73,16 +73,16 @@ class SnowsightUrlBuilder:
              url_cloud_provider_suffix = ""
          else:
              url_cloud_provider_suffix = f".{cloud}"
-         if privatelink:
-             url = f"https://app.{account_locator}.{cloud_region_id}.privatelink.{snowflake_domain}/"
+         # Note: Snowsight is always accessed via the public internet (app.snowflake.com)
+         # even for accounts using privatelink. Privatelink only applies to database connections,
+         # not the Snowsight web UI.
+         # Standard Snowsight URL format - works for most regions
+         # China region may use app.snowflake.cn instead of app.snowflake.com. This is not documented, just
+         # guessing Based on existence of snowflake.cn domain (https://domainindex.com/domains/snowflake.cn)
+         if snowflake_domain == "snowflakecomputing.cn":
+             url = f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
          else:
-             # Standard Snowsight URL format - works for most regions
-             # China region may use app.snowflake.cn instead of app.snowflake.com. This is not documented, just
-             # guessing Based on existence of snowflake.cn domain (https://domainindex.com/domains/snowflake.cn)
-             if snowflake_domain == "snowflakecomputing.cn":
-                 url = f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
-             else:
-                 url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
+             url = f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
          return url

      @staticmethod
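A minimal sketch of the URL shapes this now produces; the account locator and region values are invented, only the format strings come from the hunk above:

    def snowsight_base_url(
        account_locator: str,
        cloud_region_id: str,
        url_cloud_provider_suffix: str = "",
        snowflake_domain: str = "snowflakecomputing.com",
    ) -> str:
        # Mirrors the branch added above: privatelink no longer changes the UI URL,
        # only the Chinese domain switches to app.snowflake.cn.
        if snowflake_domain == "snowflakecomputing.cn":
            return f"https://app.snowflake.cn/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"
        return f"https://app.snowflake.com/{cloud_region_id}{url_cloud_provider_suffix}/{account_locator}/"

    # Hypothetical values, for illustration only.
    print(snowsight_base_url("ab12345", "us-east-1", ".aws"))
    # -> https://app.snowflake.com/us-east-1.aws/ab12345/
    print(snowsight_base_url("cn00001", "cn-north-1", ".aws", "snowflakecomputing.cn"))
    # -> https://app.snowflake.cn/cn-north-1.aws/cn00001/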
datahub/metadata/_internal_schema_classes.py
@@ -14040,7 +14040,7 @@ class CorpUserEditableInfoClass(_Aspect):
          else:
              self.skills = skills
          if pictureLink is None:
-             # default: 'https://raw.githubusercontent.com/datahub-project/datahub/master/datahub-web-react/src/images/default_avatar.png'
+             # default: 'assets/platforms/default_avatar.png'
              self.pictureLink = self.RECORD_SCHEMA.fields_dict["pictureLink"].default
          else:
              self.pictureLink = pictureLink
datahub/metadata/schema.avsc
@@ -6174,7 +6174,7 @@
        },
        "type": "string",
        "name": "pictureLink",
-       "default": "https://raw.githubusercontent.com/datahub-project/datahub/master/datahub-web-react/src/images/default_avatar.png",
+       "default": "assets/platforms/default_avatar.png",
        "doc": "A URL which points to a picture which user wants to set as a profile photo"
      },
      {
datahub/metadata/schemas/CorpUserEditableInfo.avsc
@@ -53,7 +53,7 @@
        },
        "type": "string",
        "name": "pictureLink",
-       "default": "https://raw.githubusercontent.com/datahub-project/datahub/master/datahub-web-react/src/images/default_avatar.png",
+       "default": "assets/platforms/default_avatar.png",
        "doc": "A URL which points to a picture which user wants to set as a profile photo"
      },
      {
datahub/metadata/schemas/MetadataChangeEvent.avsc
@@ -1749,7 +1749,7 @@
        },
        "type": "string",
        "name": "pictureLink",
-       "default": "https://raw.githubusercontent.com/datahub-project/datahub/master/datahub-web-react/src/images/default_avatar.png",
+       "default": "assets/platforms/default_avatar.png",
        "doc": "A URL which points to a picture which user wants to set as a profile photo"
      },
      {
datahub/specific/aspect_helpers/structured_properties.py
@@ -70,3 +70,30 @@ class HasStructuredPropertiesPatch(MetadataPatchProposal):
              ),
          )
          return self
+
+     def set_structured_property_manual(
+         self, property: StructuredPropertyValueAssignmentClass
+     ) -> Self:
+         """Add or update a structured property, using a StructuredPropertyValueAssignmentClass object."""
+
+         self.remove_structured_property(property.propertyUrn)
+         self._add_patch(
+             StructuredPropertiesClass.ASPECT_NAME,
+             "add",
+             path=("properties", property.propertyUrn),
+             value=property,
+         )
+         return self
+
+     def add_structured_property_manual(
+         self, property: StructuredPropertyValueAssignmentClass
+     ) -> Self:
+         """Add a structured property, using a StructuredPropertyValueAssignmentClass object."""
+
+         self._add_patch(
+             StructuredPropertiesClass.ASPECT_NAME,
+             "add",
+             path=("properties", property.propertyUrn),
+             value=property,
+         )
+         return self
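A short usage sketch of the new set_structured_property_manual helper, assuming (as in current DataHub releases) that DatasetPatchBuilder mixes in HasStructuredPropertiesPatch; the URNs and the emitter endpoint are placeholders:

    from datahub.emitter.rest_emitter import DatahubRestEmitter
    from datahub.metadata.schema_classes import StructuredPropertyValueAssignmentClass
    from datahub.specific.dataset import DatasetPatchBuilder

    # Placeholder URNs, for illustration only.
    dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,example.table,PROD)"
    property_urn = "urn:li:structuredProperty:io.example.dataTier"

    # set_structured_property_manual removes any existing assignment for the
    # property URN and re-adds the supplied assignment object as-is.
    patch = DatasetPatchBuilder(dataset_urn).set_structured_property_manual(
        StructuredPropertyValueAssignmentClass(
            propertyUrn=property_urn,
            values=["gold"],
        )
    )

    emitter = DatahubRestEmitter("http://localhost:8080")  # placeholder endpoint
    for mcp in patch.build():
        emitter.emit(mcp)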
datahub/sql_parsing/sqlglot_lineage.py
@@ -1176,7 +1176,12 @@ def _try_extract_select(
          statement = sqlglot.exp.Select().select("*").from_(statement)
      elif isinstance(statement, sqlglot.exp.Insert):
          # TODO Need to map column renames in the expressions part of the statement.
-         statement = statement.expression
+         # Preserve CTEs when extracting the SELECT expression from INSERT
+         original_ctes = statement.ctes
+         statement = statement.expression  # Get the SELECT expression from the INSERT
+         if isinstance(statement, sqlglot.exp.Query) and original_ctes:
+             for cte in original_ctes:
+                 statement = statement.with_(alias=cte.alias, as_=cte.this)
      elif isinstance(statement, sqlglot.exp.Update):
          # Assumption: the output table is already captured in the modified tables list.
          statement = _extract_select_from_update(statement)
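A minimal sketch of what the preserved-CTE path does, driving sqlglot directly; it assumes the dialect attaches a leading WITH to the Insert node (as Postgres-style data-modifying CTEs do), and the table names are invented:

    import sqlglot
    from sqlglot import exp

    sql = """
    WITH recent AS (SELECT id, amount FROM raw.events WHERE ds = '2024-01-01')
    INSERT INTO analytics.daily_totals
    SELECT id, SUM(amount) AS total FROM recent GROUP BY id
    """

    stmt = sqlglot.parse_one(sql, dialect="postgres")
    assert isinstance(stmt, exp.Insert)

    # Without the fix, taking only stmt.expression would drop the WITH clause,
    # so `recent` could no longer be resolved during lineage analysis.
    select = stmt.expression
    for cte in stmt.ctes:
        select = select.with_(alias=cte.alias, as_=cte.this)

    print(select.sql(dialect="postgres"))
    # -> roughly: WITH recent AS (...) SELECT id, SUM(amount) AS total FROM recent GROUP BY id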