acryl-datahub 1.1.0.5rc4__py3-none-any.whl → 1.1.0.5rc6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

@@ -1,19 +1,21 @@
 import dataclasses
 from dataclasses import field as dataclass_field
-from typing import List
 
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
 )
+from datahub.utilities.lossy_collections import LossyList
 
 
 @dataclasses.dataclass
 class DataLakeSourceReport(StaleEntityRemovalSourceReport):
     files_scanned = 0
-    filtered: List[str] = dataclass_field(default_factory=list)
+    filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
+    number_of_files_filtered: int = 0
 
     def report_file_scanned(self) -> None:
         self.files_scanned += 1
 
     def report_file_dropped(self, file: str) -> None:
         self.filtered.append(file)
+        self.number_of_files_filtered += 1
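
The report now stores dropped file paths in a LossyList, which bounds memory on very large scans, while the new number_of_files_filtered counter preserves an exact total. A minimal sketch of the same pattern, using a hypothetical BoundedList stand-in rather than DataHub's actual LossyList implementation:

import dataclasses
from dataclasses import field as dataclass_field
from typing import Iterator, List


class BoundedList:
    """Illustrative stand-in for a lossy list: keeps at most `max_elements` items."""

    def __init__(self, max_elements: int = 10) -> None:
        self._items: List[str] = []
        self.max_elements = max_elements

    def append(self, item: str) -> None:
        # Drop items beyond the cap instead of growing without bound.
        if len(self._items) < self.max_elements:
            self._items.append(item)

    def __iter__(self) -> Iterator[str]:
        return iter(self._items)


@dataclasses.dataclass
class FileScanReport:
    # A bounded sample of dropped paths plus an exact counter, mirroring the diff above.
    filtered: BoundedList = dataclass_field(default_factory=BoundedList)
    number_of_files_filtered: int = 0

    def report_file_dropped(self, file: str) -> None:
        self.filtered.append(file)
        self.number_of_files_filtered += 1


if __name__ == "__main__":
    report = FileScanReport()
    for i in range(1_000):
        report.report_file_dropped(f"s3://bucket/skipped/{i}.csv")
    print(report.number_of_files_filtered)  # 1000 (exact count)
    print(len(list(report.filtered)))       # 10 (bounded sample)
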
@@ -8,7 +8,6 @@ import time
 from datetime import datetime
 from pathlib import PurePath
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
-from urllib.parse import urlparse
 
 import smart_open.compression as so_compression
 from more_itertools import peekable
@@ -75,7 +74,6 @@ from datahub.metadata.schema_classes import (
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
-from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer
 
 if TYPE_CHECKING:
@@ -162,6 +160,15 @@ class Folder:
         )
 
 
+@dataclasses.dataclass
+class FolderInfo:
+    objects: List[Any]
+    total_size: int
+    min_time: datetime
+    max_time: datetime
+    latest_obj: Any
+
+
 @dataclasses.dataclass
 class BrowsePath:
     file: str
@@ -860,8 +867,18 @@ class S3Source(StatefulIngestionSourceBase):
             bucket_name, folder_split[0], self.source_config.aws_config
         )
         for folder in folders:
+            # Ensure proper path joining - folder already includes trailing slash from list_folders
+            # but we need to handle the case where folder_split[1] might start with a slash
+            remaining_pattern = folder_split[1]
+            if remaining_pattern.startswith("/"):
+                remaining_pattern = remaining_pattern[1:]
+
+            # Ensure folder ends with slash for proper path construction
+            if not folder.endswith("/"):
+                folder = folder + "/"
+
             yield from self.resolve_templated_folders(
-                bucket_name, f"{folder}{folder_split[1]}"
+                bucket_name, f"{folder}{remaining_pattern}"
             )
 
     def get_dir_to_process(
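
The added normalization avoids doubled or missing slashes when a folder returned by list_folders is recombined with the remainder of the wildcard pattern. A small standalone sketch of that joining rule (the helper name join_folder_and_pattern is illustrative, not part of the source):

def join_folder_and_pattern(folder: str, remaining_pattern: str) -> str:
    """Join a resolved folder with the rest of a wildcard pattern, avoiding '//' or a missing '/'."""
    # Strip a leading slash from the remaining pattern, if present.
    if remaining_pattern.startswith("/"):
        remaining_pattern = remaining_pattern[1:]
    # Guarantee exactly one slash between the two parts.
    if not folder.endswith("/"):
        folder = folder + "/"
    return f"{folder}{remaining_pattern}"


assert join_folder_and_pattern("data/2024", "*/part-*.parquet") == "data/2024/*/part-*.parquet"
assert join_folder_and_pattern("data/2024/", "/*/part-*.parquet") == "data/2024/*/part-*.parquet"
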
@@ -938,20 +955,47 @@ class S3Source(StatefulIngestionSourceBase):
                 self.report.report_file_dropped(s3_uri)
             return allowed
 
-        s3_objects = (
-            obj
-            for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(
-                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+        # Process objects in a memory-efficient streaming fashion
+        # Instead of loading all objects into memory, we'll accumulate folder data incrementally
+        folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
+
+        for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            if not _is_allowed_path(path_spec, s3_path):
+                continue
+
+            # Extract the directory name (folder) from the object key
+            dirname = obj.key.rsplit("/", 1)[0]
+
+            # Initialize folder data if we haven't seen this directory before
+            if dirname not in folder_data:
+                folder_data[dirname] = FolderInfo(
+                    objects=[],
+                    total_size=0,
+                    min_time=obj.last_modified,
+                    max_time=obj.last_modified,
+                    latest_obj=obj,
+                )
+
+            # Update folder statistics incrementally
+            folder_info = folder_data[dirname]
+            folder_info.objects.append(obj)
+            folder_info.total_size += obj.size
+
+            # Track min/max times and latest object
+            if obj.last_modified < folder_info.min_time:
+                folder_info.min_time = obj.last_modified
+            if obj.last_modified > folder_info.max_time:
+                folder_info.max_time = obj.last_modified
+                folder_info.latest_obj = obj
+
+        # Yield folders after processing all objects
+        for _dirname, folder_info in folder_data.items():
+            latest_obj = folder_info.latest_obj
+            max_file_s3_path = self.create_s3_path(
+                latest_obj.bucket_name, latest_obj.key
             )
-        )
-        grouped_s3_objects_by_dirname = groupby_unsorted(
-            s3_objects,
-            key=lambda obj: obj.key.rsplit("/", 1)[0],
-        )
-        for _, group in grouped_s3_objects_by_dirname:
-            max_file = max(group, key=lambda x: x.last_modified)
-            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
 
             # If partition_id is None, it means the folder is not a partition
             partition_id = path_spec.get_partition_from_path(max_file_s3_path)
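
The new listing loop replaces the groupby_unsorted generator with a dictionary of per-folder accumulators, so minimum and maximum timestamps, total size, and the most recent object are tracked in a single pass instead of materializing each group. A self-contained sketch of that accumulation pattern over plain records (FolderInfo and Obj here are local illustrations, not the classes from the diff):

import dataclasses
from datetime import datetime, timedelta
from typing import Any, Dict, List


@dataclasses.dataclass
class FolderInfo:
    objects: List[Any]
    total_size: int
    min_time: datetime
    max_time: datetime
    latest_obj: Any


@dataclasses.dataclass
class Obj:  # stand-in for a boto3 ObjectSummary
    key: str
    size: int
    last_modified: datetime


def aggregate_by_folder(objects: List[Obj]) -> Dict[str, FolderInfo]:
    folder_data: Dict[str, FolderInfo] = {}
    for obj in objects:
        # Group by the object's parent "directory" in the key.
        dirname = obj.key.rsplit("/", 1)[0]
        info = folder_data.get(dirname)
        if info is None:
            info = folder_data[dirname] = FolderInfo([], 0, obj.last_modified, obj.last_modified, obj)
        info.objects.append(obj)
        info.total_size += obj.size
        # Update running min/max timestamps and remember the newest object.
        if obj.last_modified < info.min_time:
            info.min_time = obj.last_modified
        if obj.last_modified > info.max_time:
            info.max_time = obj.last_modified
            info.latest_obj = obj
    return folder_data


t0 = datetime(2024, 1, 1)
objs = [Obj(f"tbl/part={i % 2}/f{i}.parquet", 100, t0 + timedelta(hours=i)) for i in range(4)]
for dirname, info in aggregate_by_folder(objs).items():
    print(dirname, info.total_size, info.latest_obj.key)
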
@@ -959,37 +1003,100 @@ class S3Source(StatefulIngestionSourceBase):
             yield Folder(
                 partition_id=partition_id,
                 is_partition=bool(partition_id),
-                creation_time=min(obj.last_modified for obj in group),
-                modification_time=max_file.last_modified,
+                creation_time=folder_info.min_time,
+                modification_time=folder_info.max_time,
                 sample_file=max_file_s3_path,
-                size=sum(obj.size for obj in group),
+                size=folder_info.total_size,
             )
 
+    def create_s3_path(self, bucket_name: str, key: str) -> str:
+        return f"s3://{bucket_name}/{key}"
+
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
+        """
+        Main entry point for browsing S3 objects and creating table-level datasets.
+
+        This method determines whether to use templated processing (for paths with {table})
+        or simple file-by-file processing (for paths without templates).
+
+        Args:
+            path_spec: Configuration specifying the S3 path pattern to scan
+            sample_size: Number of files to sample (used in simple processing)
+
+        Returns:
+            Iterator of BrowsePath objects representing datasets to be created
+
+        Examples:
+            - Templated: s3://bucket/data/*/{table}/** -> Groups files by table
+            - Simple: s3://bucket/data/*.csv -> Processes individual files
+        """
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
+
         s3 = self.source_config.aws_config.get_s3_resource(
             self.source_config.verify_ssl
         )
         bucket_name = get_bucket_name(path_spec.include)
-        logger.debug(f"Scanning bucket: {bucket_name}")
         bucket = s3.Bucket(bucket_name)
-        prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
-        logger.debug(f"Scanning objects with prefix:{prefix}")
+
+        logger.debug(f"Scanning bucket: {bucket_name}")
+        logger.info(f"Processing path spec: {path_spec.include}")
+
+        # Check if we have {table} template in the path
+        has_table_template = "{table}" in path_spec.include
+
+        logger.info(f"Has table template: {has_table_template}")
+
+        if has_table_template:
+            logger.info("Using templated path processing")
+            # Always use templated processing when {table} is present
+            # This groups files under table-level datasets
+            yield from self._process_templated_path(path_spec, bucket, bucket_name)
+        else:
+            logger.info("Using simple path processing")
+            # Only use simple processing for non-templated paths
+            # This creates individual file-level datasets
+            yield from self._process_simple_path(path_spec, bucket, bucket_name)
+
+    def _process_templated_path(
+        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
+    ) -> Iterable[BrowsePath]:
+        """
+        Process S3 paths containing {table} templates to create table-level datasets.
+
+        This method handles complex path patterns with wildcards and templates by:
+        1. Replacing template placeholders with stars (except {table})
+        2. Resolving wildcards in the path up to the {table} marker
+        3. Finding all potential table folders under each resolved path
+        4. Applying configurable partition traversal strategy (ALL, MAX, MIN_MAX)
+        5. Aggregating files from selected partitions under each table
+        6. Creating one dataset per table (not per file)
+
+        Args:
+            path_spec: Path specification with {table} template
+            bucket: S3 bucket resource
+            bucket_name: Name of the S3 bucket
+
+        Yields:
+            BrowsePath: One per table (not per file), containing aggregated metadata
+        """
+        # Find the part before {table}
+        table_marker = "{table}"
+        if table_marker not in path_spec.include:
+            logger.info("No {table} marker found in path")
+            return
+
+        # STEP 1: Replace template placeholders with stars (except {table}) to enable folder resolution
+        # This is the crucial missing logic from the original implementation
         matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE)
         matches_list = list(matches)
-        if matches_list and path_spec.sample_files:
-            # Replace the patch_spec include's templates with star because later we want to resolve all the stars
-            # to actual directories.
-            # For example:
-            # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*"
-            # We only keep the last template as a marker to know the point util we need to resolve path.
-            # After the marker we can safely get sample files for sampling because it is not used in the
-            # table name, so we don't need all the files.
-            # This speed up processing but we won't be able to get a precise modification date/size/number of files.
+
+        if matches_list:
+            # Replace all templates with stars except keep {table} as the marker
             max_start: int = -1
             include: str = path_spec.include
             max_match: str = ""
+
             for match in matches_list:
                 pos = include.find(match.group())
                 if pos > max_start:
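
The template-replacement step turns every {placeholder} except {table} into a star, so the prefix ahead of the table marker can be resolved like any other wildcard path. A rough standalone sketch of that substitution, simplifying the re.finditer loop shown in the diff:

import re


def replace_templates_with_stars(include: str, table_marker: str = "{table}") -> str:
    """Turn '{dept}'-style placeholders into '*', keeping the table marker intact."""

    def repl(match: re.Match) -> str:
        # Keep the {table} marker; every other placeholder becomes a wildcard.
        return match.group() if match.group() == table_marker else "*"

    return re.sub(r"{\s*\w+\s*}", repl, include)


print(replace_templates_with_stars("s3://my-test-bucket/*/{dept}/*/{table}/*/*.*"))
# s3://my-test-bucket/*/*/*/{table}/*/*.*
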
@@ -1001,120 +1108,249 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_match == "{table}":
                     break
 
-            table_index = include.find(max_match)
-            for folder in self.resolve_templated_folders(
-                bucket_name, get_bucket_relative_path(include[:table_index])
-            ):
-                try:
-                    for f in list_folders(
-                        bucket_name, f"{folder}", self.source_config.aws_config
-                    ):
-                        table_path = self.create_s3_path(bucket_name, f)
-                        table_name, _ = path_spec.extract_table_name_and_path(
-                            table_path
+            logger.info(f"Template replacement: {path_spec.include} -> {include}")
+        else:
+            include = path_spec.include
+
+        # Split the path at {table} to get the prefix that needs wildcard resolution
+        prefix_before_table = include.split(table_marker)[0]
+        # Remove the s3:// and bucket name to get the relative path
+        relative_path = get_bucket_relative_path(prefix_before_table)
+
+        logger.info(f"Prefix before table: {prefix_before_table}")
+        logger.info(f"Relative path for resolution: {relative_path}")
+
+        try:
+            # STEP 2: Resolve ALL wildcards in the path up to {table}
+            # This converts patterns like "data/*/logs/" to actual paths like ["data/2023/logs/", "data/2024/logs/"]
+            table_index = include.find(table_marker)
+            folder_prefix = get_bucket_relative_path(include[:table_index])
+
+            resolved_prefixes = list(
+                self.resolve_templated_folders(bucket_name, folder_prefix)
+            )
+            logger.info(f"Resolved prefixes: {resolved_prefixes}")
+
+            # STEP 3: Process each resolved prefix to find table folders
+            for resolved_prefix in resolved_prefixes:
+                logger.info(f"Processing resolved prefix: {resolved_prefix}")
+
+                # Get all folders that could be tables under this resolved prefix
+                # These are the actual table names (e.g., "users", "events", "logs")
+                table_folders = list(
+                    list_folders(
+                        bucket_name, resolved_prefix, self.source_config.aws_config
+                    )
+                )
+                logger.debug(
+                    f"Found table folders under {resolved_prefix}: {table_folders}"
+                )
+
+                # STEP 4: Process each table folder to create a table-level dataset
+                for table_folder in table_folders:
+                    # Create the full S3 path for this table
+                    table_s3_path = self.create_s3_path(
+                        bucket_name, table_folder.rstrip("/")
+                    )
+                    logger.info(
+                        f"Processing table folder: {table_folder} -> {table_s3_path}"
+                    )
+
+                    # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
+                    # This uses the compiled regex pattern to extract the table name from the full path
+                    table_name, table_path = path_spec.extract_table_name_and_path(
+                        table_s3_path
+                    )
+
+                    # Apply table name filtering if configured
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(f"Table '{table_name}' not allowed and skipping")
+                        continue
+
+                    # STEP 5: Handle partition traversal based on configuration
+                    # Get all partition folders first
+                    all_partition_folders = list(
+                        list_folders(
+                            bucket_name, table_folder, self.source_config.aws_config
                         )
-                        if not path_spec.tables_filter_pattern.allowed(table_name):
-                            logger.debug(
-                                f"Table '{table_name}' not allowed and skipping"
-                            )
-                            self.report.report_file_dropped(table_path)
-                            continue
+                    )
+                    logger.info(
+                        f"Found {len(all_partition_folders)} partition folders under table {table_name} using method {path_spec.traversal_method}"
+                    )
 
+                    if all_partition_folders:
+                        # Apply the same traversal logic as the original code
                         dirs_to_process = []
-                        logger.info(f"Processing folder: {f}")
+
                         if path_spec.traversal_method == FolderTraversalMethod.ALL:
-                            dirs_to_process.append(f)
+                            # Process ALL partitions (original behavior)
+                            dirs_to_process = all_partition_folders
+                            logger.debug(
+                                f"Processing ALL {len(all_partition_folders)} partitions"
+                            )
+
                         else:
+                            # Use the original get_dir_to_process logic for MIN/MAX
+                            protocol = "s3://"  # Default protocol for S3
+
                             if (
                                 path_spec.traversal_method
                                 == FolderTraversalMethod.MIN_MAX
                                 or path_spec.traversal_method
                                 == FolderTraversalMethod.MAX
                             ):
-                                protocol = ContainerWUCreator.get_protocol(
-                                    path_spec.include
-                                )
+                                # Get MAX partition using original logic
                                 dirs_to_process_max = self.get_dir_to_process(
                                     bucket_name=bucket_name,
-                                    folder=f + "/",
+                                    folder=table_folder + "/",
                                     path_spec=path_spec,
                                     protocol=protocol,
+                                    min=False,
                                 )
-                                dirs_to_process.append(dirs_to_process_max[0])
+                                if dirs_to_process_max:
+                                    # Convert full S3 paths back to relative paths for processing
+                                    dirs_to_process.extend(
+                                        [
+                                            d.replace(f"{protocol}{bucket_name}/", "")
+                                            for d in dirs_to_process_max
+                                        ]
+                                    )
+                                    logger.debug(
+                                        f"Added MAX partition: {dirs_to_process_max}"
+                                    )
 
                             if (
                                 path_spec.traversal_method
                                 == FolderTraversalMethod.MIN_MAX
                             ):
+                                # Get MIN partition using original logic
                                 dirs_to_process_min = self.get_dir_to_process(
                                     bucket_name=bucket_name,
-                                    folder=f + "/",
+                                    folder=table_folder + "/",
                                     path_spec=path_spec,
                                     protocol=protocol,
                                     min=True,
                                 )
-                                dirs_to_process.append(dirs_to_process_min[0])
-                        folders: List[Folder] = []
-                        for dir in dirs_to_process:
-                            logger.info(f"Getting files from folder: {dir}")
-                            prefix_to_process = urlparse(dir).path.lstrip("/")
-
-                            folders.extend(
-                                self.get_folder_info(
-                                    path_spec, bucket, prefix_to_process
-                                )
-                            )
+                                if dirs_to_process_min:
+                                    # Convert full S3 paths back to relative paths for processing
+                                    dirs_to_process.extend(
+                                        [
+                                            d.replace(f"{protocol}{bucket_name}/", "")
+                                            for d in dirs_to_process_min
+                                        ]
+                                    )
+                                    logger.debug(
+                                        f"Added MIN partition: {dirs_to_process_min}"
+                                    )
+
+                        # Process the selected partitions
+                        all_folders = []
+                        for partition_folder in dirs_to_process:
+                            # Ensure we have a clean folder path
+                            clean_folder = partition_folder.rstrip("/")
+
+                            logger.info(f"Scanning files in partition: {clean_folder}")
+                            partition_files = list(
+                                self.get_folder_info(path_spec, bucket, clean_folder)
+                            )
+                            all_folders.extend(partition_files)
+
+                        if all_folders:
+                            # Use the most recent file across all processed partitions
+                            latest_file = max(
+                                all_folders, key=lambda x: x.modification_time
+                            )
+
+                            # Get partition information
+                            partitions = [f for f in all_folders if f.is_partition]
+
+                            # Calculate total size of processed partitions
+                            total_size = sum(f.size for f in all_folders)
+
+                            # Create ONE BrowsePath per table
+                            # The key insight: we need to provide the sample file for schema inference
+                            # but the table path should be extracted correctly by extract_table_name_and_path
+                            yield BrowsePath(
+                                file=latest_file.sample_file,  # Sample file for schema inference
+                                timestamp=latest_file.modification_time,  # Latest timestamp
+                                size=total_size,  # Size of processed partitions
+                                partitions=partitions,  # Partition metadata
                             )
-                        max_folder = None
-                        if folders:
-                            max_folder = max(folders, key=lambda x: x.modification_time)
-                        if not max_folder:
+                        else:
                             logger.warning(
-                                f"Unable to find any files in the folder {dir}. Skipping..."
+                                f"No files found in processed partitions for table {table_name}"
                             )
-                            continue
-
-                        partitions = list(filter(lambda x: x.is_partition, folders))
-                        yield BrowsePath(
-                            file=max_folder.sample_file,
-                            timestamp=max_folder.modification_time,
-                            size=max_folder.size,
-                            partitions=partitions,
-                            # TODO: Support content type inference for partitions
-                        )
-                except Exception as e:
-                    # This odd check if being done because boto does not have a proper exception to catch
-                    # The exception that appears in stacktrace cannot actually be caught without a lot more work
-                    # https://github.com/boto/boto3/issues/1195
-                    if "NoSuchBucket" in repr(e):
-                        logger.debug(f"Got NoSuchBucket exception for {bucket_name}", e)
-                        self.get_report().report_warning(
-                            "Missing bucket", f"No bucket found {bucket_name}"
-                        )
                     else:
-                        raise e
-        else:
-            logger.debug(
-                "No template in the pathspec can't do sampling, fallbacking to do full scan"
-            )
-            path_spec.sample_files = False
-            for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
-                s3_path = self.create_s3_path(obj.bucket_name, obj.key)
-                logger.debug(f"Path: {s3_path}")
-
-                content_type = None
-                if self.source_config.use_s3_content_type:
-                    content_type = s3.Object(obj.bucket_name, obj.key).content_type
-
-                yield BrowsePath(
-                    file=s3_path,
-                    timestamp=obj.last_modified,
-                    size=obj.size,
-                    partitions=[],
-                    content_type=content_type,
+                        logger.warning(
+                            f"No partition folders found under table {table_name}"
+                        )
+
+        except Exception as e:
+            if "NoSuchBucket" in repr(e):
+                self.get_report().report_warning(
+                    "Missing bucket", f"No bucket found {bucket_name}"
                 )
+                return
+            logger.error(f"Error in _process_templated_path: {e}")
+            raise e
 
-    def create_s3_path(self, bucket_name: str, key: str) -> str:
-        return f"s3://{bucket_name}/{key}"
+    def _process_simple_path(
+        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
+    ) -> Iterable[BrowsePath]:
+        """
+        Process simple S3 paths without {table} templates to create file-level datasets.
+
+        This method handles straightforward file patterns by:
+        1. Listing all files matching the pattern
+        2. Creating one dataset per file
+        3. No aggregation or grouping is performed
+
+        Use Cases:
+        - Individual file processing: s3://bucket/data/*.csv
+        - Direct file paths: s3://bucket/data/myfile.json
+        - Patterns without table grouping: s3://bucket/logs/*.log
+
+        Args:
+            path_spec: Path specification without {table} template
+            bucket: S3 bucket resource
+            bucket_name: Name of the S3 bucket
+
+        Yields:
+            BrowsePath: One per file, containing individual file metadata
+
+        Example Output:
+        - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
+        - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
+        """
+        assert self.source_config.aws_config is not None, "aws_config not set"
+
+        path_spec.sample_files = False  # Disable sampling for simple paths
+
+        # Extract the prefix from the path spec (stops at first wildcard)
+        prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
+
+        # Get s3 resource for content type checking
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
+
+        # Iterate through all objects in the bucket matching the prefix
+        for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            # Get content type if configured
+            content_type = None
+            if self.source_config.use_s3_content_type:
+                content_type = s3.Object(obj.bucket_name, obj.key).content_type
+
+            # Create one BrowsePath per file
+            yield BrowsePath(
+                file=s3_path,
+                timestamp=obj.last_modified,
+                size=obj.size,
+                partitions=[],  # No partitions in simple mode
+                content_type=content_type,
+            )
 
     def local_browser(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         prefix = self.get_prefix(path_spec.include)
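
The traversal branch decides which partition folders are scanned per table: ALL keeps every partition, MAX keeps only the latest, and MIN_MAX keeps the earliest and latest. A compact sketch of that selection over sorted partition names (the enum is a local stand-in for DataHub's FolderTraversalMethod, and lexicographic order stands in for what get_dir_to_process resolves via the path spec):

from enum import Enum
from typing import List


class FolderTraversalMethod(Enum):
    ALL = "ALL"
    MAX = "MAX"
    MIN_MAX = "MIN_MAX"


def select_partitions(partitions: List[str], method: FolderTraversalMethod) -> List[str]:
    """Pick which partition folders to scan; a simplification of the MIN/MAX logic in the diff."""
    if not partitions:
        return []
    ordered = sorted(partitions)
    if method is FolderTraversalMethod.ALL:
        return ordered
    if method is FolderTraversalMethod.MAX:
        return [ordered[-1]]
    # MIN_MAX: earliest and latest partition (deduplicated if only one exists)
    return sorted({ordered[0], ordered[-1]})


parts = ["tbl/year=2022/", "tbl/year=2023/", "tbl/year=2024/"]
print(select_partitions(parts, FolderTraversalMethod.MAX))      # ['tbl/year=2024/']
print(select_partitions(parts, FolderTraversalMethod.MIN_MAX))  # ['tbl/year=2022/', 'tbl/year=2024/']
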
@@ -33,7 +33,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -532,11 +535,11 @@ class SalesforceApi:
 @capability(
     capability_name=SourceCapability.DATA_PROFILING,
     description="Only table level profiling is supported via `profiling.enabled` config field",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
 )
 @capability(
     capability_name=SourceCapability.DELETION_DETECTION,
-    description="Not supported yet",
-    supported=False,
+    description="Enabled by default via stateful ingestion",
 )
 @capability(
     capability_name=SourceCapability.SCHEMA_METADATA,