acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/s3/source.py

@@ -8,7 +8,6 @@ import time
  from datetime import datetime
  from pathlib import PurePath
  from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
- from urllib.parse import urlparse

  import smart_open.compression as so_compression
  from more_itertools import peekable
@@ -75,7 +74,6 @@ from datahub.metadata.schema_classes import (
  _Aspect,
  )
  from datahub.telemetry import stats, telemetry
- from datahub.utilities.groupby import groupby_unsorted
  from datahub.utilities.perf_timer import PerfTimer

  if TYPE_CHECKING:
@@ -162,6 +160,15 @@ class Folder:
  )


+ @dataclasses.dataclass
+ class FolderInfo:
+ objects: List[Any]
+ total_size: int
+ min_time: datetime
+ max_time: datetime
+ latest_obj: Any
+
+
  @dataclasses.dataclass
  class BrowsePath:
  file: str
@@ -675,7 +682,7 @@ class S3Source(StatefulIngestionSourceBase):

  logger.info(f"Extracting table schema from file: {table_data.full_path}")
  browse_path: str = (
- strip_s3_prefix(table_data.table_path)
+ self.strip_s3_prefix(table_data.table_path)
  if self.is_s3_platform()
  else table_data.table_path.strip("/")
  )
@@ -860,8 +867,18 @@ class S3Source(StatefulIngestionSourceBase):
  bucket_name, folder_split[0], self.source_config.aws_config
  )
  for folder in folders:
+ # Ensure proper path joining - folder already includes trailing slash from list_folders
+ # but we need to handle the case where folder_split[1] might start with a slash
+ remaining_pattern = folder_split[1]
+ if remaining_pattern.startswith("/"):
+ remaining_pattern = remaining_pattern[1:]
+
+ # Ensure folder ends with slash for proper path construction
+ if not folder.endswith("/"):
+ folder = folder + "/"
+
  yield from self.resolve_templated_folders(
- bucket_name, f"{folder}{folder_split[1]}"
+ bucket_name, f"{folder}{remaining_pattern}"
  )

  def get_dir_to_process(
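
Editorial note: the added slash handling above keeps the recursive call from producing doubled or missing separators when a resolved folder is recombined with the rest of the pattern. A minimal standalone sketch of the same normalization (hypothetical inputs, not code from the package):

    def join_folder_and_pattern(folder: str, remaining_pattern: str) -> str:
        """Join a resolved folder with the remainder of a path pattern without '//' artifacts."""
        if remaining_pattern.startswith("/"):
            remaining_pattern = remaining_pattern[1:]
        if not folder.endswith("/"):
            folder = folder + "/"
        return f"{folder}{remaining_pattern}"

    assert join_folder_and_pattern("data/2024", "/{table}/*.parquet") == "data/2024/{table}/*.parquet"
    assert join_folder_and_pattern("data/2024/", "{table}/*.parquet") == "data/2024/{table}/*.parquet"
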
@@ -932,26 +949,56 @@ class S3Source(StatefulIngestionSourceBase):
  """

  def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
- allowed = path_spec_.allowed(s3_uri)
+ # Normalize URI for pattern matching
+ normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+ allowed = path_spec_.allowed(normalized_uri)
  if not allowed:
  logger.debug(f"File {s3_uri} not allowed and skipping")
  self.report.report_file_dropped(s3_uri)
  return allowed

- s3_objects = (
- obj
- for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
- if _is_allowed_path(
- path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+ # Process objects in a memory-efficient streaming fashion
+ # Instead of loading all objects into memory, we'll accumulate folder data incrementally
+ folder_data: Dict[str, FolderInfo] = {} # dirname -> FolderInfo
+
+ for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+ s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+ if not _is_allowed_path(path_spec, s3_path):
+ continue
+
+ # Extract the directory name (folder) from the object key
+ dirname = obj.key.rsplit("/", 1)[0]
+
+ # Initialize folder data if we haven't seen this directory before
+ if dirname not in folder_data:
+ folder_data[dirname] = FolderInfo(
+ objects=[],
+ total_size=0,
+ min_time=obj.last_modified,
+ max_time=obj.last_modified,
+ latest_obj=obj,
+ )
+
+ # Update folder statistics incrementally
+ folder_info = folder_data[dirname]
+ folder_info.objects.append(obj)
+ folder_info.total_size += obj.size
+
+ # Track min/max times and latest object
+ if obj.last_modified < folder_info.min_time:
+ folder_info.min_time = obj.last_modified
+ if obj.last_modified > folder_info.max_time:
+ folder_info.max_time = obj.last_modified
+ folder_info.latest_obj = obj
+
+ # Yield folders after processing all objects
+ for _dirname, folder_info in folder_data.items():
+ latest_obj = folder_info.latest_obj
+ max_file_s3_path = self.create_s3_path(
+ latest_obj.bucket_name, latest_obj.key
  )
- )
- grouped_s3_objects_by_dirname = groupby_unsorted(
- s3_objects,
- key=lambda obj: obj.key.rsplit("/", 1)[0],
- )
- for _, group in grouped_s3_objects_by_dirname:
- max_file = max(group, key=lambda x: x.last_modified)
- max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)

  # If partition_id is None, it means the folder is not a partition
  partition_id = path_spec.get_partition_from_path(max_file_s3_path)
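
Editorial note: the hunk above replaces the groupby_unsorted pipeline with a single streaming pass that keeps one FolderInfo accumulator per directory, so min/max timestamps, total size, and the latest object are tracked without materializing every group. A self-contained sketch of the same accumulation pattern over plain tuples (names and sample data are illustrative, not from the package):

    import dataclasses
    from datetime import datetime
    from typing import Dict, List, Tuple

    @dataclasses.dataclass
    class FolderStats:
        total_size: int
        min_time: datetime
        max_time: datetime
        latest_key: str

    def aggregate_by_folder(objects: List[Tuple[str, int, datetime]]) -> Dict[str, FolderStats]:
        """Accumulate per-folder size and modification-time bounds in one pass."""
        folders: Dict[str, FolderStats] = {}
        for key, size, modified in objects:
            dirname = key.rsplit("/", 1)[0]
            stats = folders.get(dirname)
            if stats is None:
                folders[dirname] = FolderStats(size, modified, modified, key)
                continue
            stats.total_size += size
            if modified < stats.min_time:
                stats.min_time = modified
            if modified > stats.max_time:
                stats.max_time = modified
                stats.latest_key = key
        return folders

    print(aggregate_by_folder([
        ("data/t1/part-0.parquet", 100, datetime(2024, 1, 1)),
        ("data/t1/part-1.parquet", 200, datetime(2024, 1, 3)),
        ("data/t2/part-0.parquet", 50, datetime(2024, 1, 2)),
    ]))
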
@@ -959,37 +1006,100 @@ class S3Source(StatefulIngestionSourceBase):
  yield Folder(
  partition_id=partition_id,
  is_partition=bool(partition_id),
- creation_time=min(obj.last_modified for obj in group),
- modification_time=max_file.last_modified,
+ creation_time=folder_info.min_time,
+ modification_time=folder_info.max_time,
  sample_file=max_file_s3_path,
- size=sum(obj.size for obj in group),
+ size=folder_info.total_size,
  )

+ def create_s3_path(self, bucket_name: str, key: str) -> str:
+ return f"s3://{bucket_name}/{key}"
+
  def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
+ """
+ Main entry point for browsing S3 objects and creating table-level datasets.
+
+ This method determines whether to use templated processing (for paths with {table})
+ or simple file-by-file processing (for paths without templates).
+
+ Args:
+ path_spec: Configuration specifying the S3 path pattern to scan
+ sample_size: Number of files to sample (used in simple processing)
+
+ Returns:
+ Iterator of BrowsePath objects representing datasets to be created
+
+ Examples:
+ - Templated: s3://bucket/data/*/{table}/** -> Groups files by table
+ - Simple: s3://bucket/data/*.csv -> Processes individual files
+ """
  if self.source_config.aws_config is None:
  raise ValueError("aws_config not set. Cannot browse s3")
+
  s3 = self.source_config.aws_config.get_s3_resource(
  self.source_config.verify_ssl
  )
  bucket_name = get_bucket_name(path_spec.include)
- logger.debug(f"Scanning bucket: {bucket_name}")
  bucket = s3.Bucket(bucket_name)
- prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
- logger.debug(f"Scanning objects with prefix:{prefix}")
+
+ logger.debug(f"Scanning bucket: {bucket_name}")
+ logger.info(f"Processing path spec: {path_spec.include}")
+
+ # Check if we have {table} template in the path
+ has_table_template = "{table}" in path_spec.include
+
+ logger.info(f"Has table template: {has_table_template}")
+
+ if has_table_template:
+ logger.info("Using templated path processing")
+ # Always use templated processing when {table} is present
+ # This groups files under table-level datasets
+ yield from self._process_templated_path(path_spec, bucket, bucket_name)
+ else:
+ logger.info("Using simple path processing")
+ # Only use simple processing for non-templated paths
+ # This creates individual file-level datasets
+ yield from self._process_simple_path(path_spec, bucket, bucket_name)
+
+ def _process_templated_path(
+ self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
+ ) -> Iterable[BrowsePath]:
+ """
+ Process S3 paths containing {table} templates to create table-level datasets.
+
+ This method handles complex path patterns with wildcards and templates by:
+ 1. Replacing template placeholders with stars (except {table})
+ 2. Resolving wildcards in the path up to the {table} marker
+ 3. Finding all potential table folders under each resolved path
+ 4. Applying configurable partition traversal strategy (ALL, MAX, MIN_MAX)
+ 5. Aggregating files from selected partitions under each table
+ 6. Creating one dataset per table (not per file)
+
+ Args:
+ path_spec: Path specification with {table} template
+ bucket: S3 bucket resource
+ bucket_name: Name of the S3 bucket
+
+ Yields:
+ BrowsePath: One per table (not per file), containing aggregated metadata
+ """
+ # Find the part before {table}
+ table_marker = "{table}"
+ if table_marker not in path_spec.include:
+ logger.info("No {table} marker found in path")
+ return
+
+ # STEP 1: Replace template placeholders with stars (except {table}) to enable folder resolution
+ # This is the crucial missing logic from the original implementation
  matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE)
  matches_list = list(matches)
- if matches_list and path_spec.sample_files:
- # Replace the patch_spec include's templates with star because later we want to resolve all the stars
- # to actual directories.
- # For example:
- # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*"
- # We only keep the last template as a marker to know the point util we need to resolve path.
- # After the marker we can safely get sample files for sampling because it is not used in the
- # table name, so we don't need all the files.
- # This speed up processing but we won't be able to get a precise modification date/size/number of files.
+
+ if matches_list:
+ # Replace all templates with stars except keep {table} as the marker
  max_start: int = -1
  include: str = path_spec.include
  max_match: str = ""
+
  for match in matches_list:
  pos = include.find(match.group())
  if pos > max_start:
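
Editorial note: the loop above (continued in the next hunk) rewrites every {placeholder} except {table} into a wildcard so the prefix can be resolved to concrete folders. A compact re.sub sketch of the same idea, reusing the example from the removed comment; this is an illustration, not the loop the diff actually adds:

    import re

    def templates_to_wildcards(include: str, keep: str = "{table}") -> str:
        """Replace every {placeholder} with '*' except the marker we still need."""
        return re.sub(
            r"{\s*\w+\s*}",
            lambda m: m.group(0) if m.group(0) == keep else "*",
            include,
        )

    assert (
        templates_to_wildcards("s3://my-test-bucket/*/{dept}/*/{table}/*/*.*")
        == "s3://my-test-bucket/*/*/*/{table}/*/*.*"
    )
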
@@ -1001,120 +1111,249 @@ class S3Source(StatefulIngestionSourceBase):
  if max_match == "{table}":
  break

- table_index = include.find(max_match)
- for folder in self.resolve_templated_folders(
- bucket_name, get_bucket_relative_path(include[:table_index])
- ):
- try:
- for f in list_folders(
- bucket_name, f"{folder}", self.source_config.aws_config
- ):
- table_path = self.create_s3_path(bucket_name, f)
- table_name, _ = path_spec.extract_table_name_and_path(
- table_path
+ logger.info(f"Template replacement: {path_spec.include} -> {include}")
+ else:
+ include = path_spec.include
+
+ # Split the path at {table} to get the prefix that needs wildcard resolution
+ prefix_before_table = include.split(table_marker)[0]
+ # Remove the s3:// and bucket name to get the relative path
+ relative_path = get_bucket_relative_path(prefix_before_table)
+
+ logger.info(f"Prefix before table: {prefix_before_table}")
+ logger.info(f"Relative path for resolution: {relative_path}")
+
+ try:
+ # STEP 2: Resolve ALL wildcards in the path up to {table}
+ # This converts patterns like "data/*/logs/" to actual paths like ["data/2023/logs/", "data/2024/logs/"]
+ table_index = include.find(table_marker)
+ folder_prefix = get_bucket_relative_path(include[:table_index])
+
+ resolved_prefixes = list(
+ self.resolve_templated_folders(bucket_name, folder_prefix)
+ )
+ logger.info(f"Resolved prefixes: {resolved_prefixes}")
+
+ # STEP 3: Process each resolved prefix to find table folders
+ for resolved_prefix in resolved_prefixes:
+ logger.info(f"Processing resolved prefix: {resolved_prefix}")
+
+ # Get all folders that could be tables under this resolved prefix
+ # These are the actual table names (e.g., "users", "events", "logs")
+ table_folders = list(
+ list_folders(
+ bucket_name, resolved_prefix, self.source_config.aws_config
+ )
+ )
+ logger.debug(
+ f"Found table folders under {resolved_prefix}: {table_folders}"
+ )
+
+ # STEP 4: Process each table folder to create a table-level dataset
+ for table_folder in table_folders:
+ # Create the full S3 path for this table
+ table_s3_path = self.create_s3_path(
+ bucket_name, table_folder.rstrip("/")
+ )
+ logger.info(
+ f"Processing table folder: {table_folder} -> {table_s3_path}"
+ )
+
+ # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
+ # This uses the compiled regex pattern to extract the table name from the full path
+ table_name, table_path = path_spec.extract_table_name_and_path(
+ table_s3_path
+ )
+
+ # Apply table name filtering if configured
+ if not path_spec.tables_filter_pattern.allowed(table_name):
+ logger.debug(f"Table '{table_name}' not allowed and skipping")
+ continue
+
+ # STEP 5: Handle partition traversal based on configuration
+ # Get all partition folders first
+ all_partition_folders = list(
+ list_folders(
+ bucket_name, table_folder, self.source_config.aws_config
  )
- if not path_spec.tables_filter_pattern.allowed(table_name):
- logger.debug(
- f"Table '{table_name}' not allowed and skipping"
- )
- self.report.report_file_dropped(table_path)
- continue
+ )
+ logger.info(
+ f"Found {len(all_partition_folders)} partition folders under table {table_name} using method {path_spec.traversal_method}"
+ )

+ if all_partition_folders:
+ # Apply the same traversal logic as the original code
  dirs_to_process = []
- logger.info(f"Processing folder: {f}")
+
  if path_spec.traversal_method == FolderTraversalMethod.ALL:
- dirs_to_process.append(f)
+ # Process ALL partitions (original behavior)
+ dirs_to_process = all_partition_folders
+ logger.debug(
+ f"Processing ALL {len(all_partition_folders)} partitions"
+ )
+
  else:
+ # Use the original get_dir_to_process logic for MIN/MAX
+ protocol = "s3://" # Default protocol for S3
+
  if (
  path_spec.traversal_method
  == FolderTraversalMethod.MIN_MAX
  or path_spec.traversal_method
  == FolderTraversalMethod.MAX
  ):
- protocol = ContainerWUCreator.get_protocol(
- path_spec.include
- )
+ # Get MAX partition using original logic
  dirs_to_process_max = self.get_dir_to_process(
  bucket_name=bucket_name,
- folder=f + "/",
+ folder=table_folder + "/",
  path_spec=path_spec,
  protocol=protocol,
+ min=False,
  )
- dirs_to_process.append(dirs_to_process_max[0])
+ if dirs_to_process_max:
+ # Convert full S3 paths back to relative paths for processing
+ dirs_to_process.extend(
+ [
+ d.replace(f"{protocol}{bucket_name}/", "")
+ for d in dirs_to_process_max
+ ]
+ )
+ logger.debug(
+ f"Added MAX partition: {dirs_to_process_max}"
+ )

  if (
  path_spec.traversal_method
  == FolderTraversalMethod.MIN_MAX
  ):
+ # Get MIN partition using original logic
  dirs_to_process_min = self.get_dir_to_process(
  bucket_name=bucket_name,
- folder=f + "/",
+ folder=table_folder + "/",
  path_spec=path_spec,
  protocol=protocol,
  min=True,
  )
- dirs_to_process.append(dirs_to_process_min[0])
- folders: List[Folder] = []
- for dir in dirs_to_process:
- logger.info(f"Getting files from folder: {dir}")
- prefix_to_process = urlparse(dir).path.lstrip("/")
-
- folders.extend(
- self.get_folder_info(
- path_spec, bucket, prefix_to_process
- )
+ if dirs_to_process_min:
+ # Convert full S3 paths back to relative paths for processing
+ dirs_to_process.extend(
+ [
+ d.replace(f"{protocol}{bucket_name}/", "")
+ for d in dirs_to_process_min
+ ]
+ )
+ logger.debug(
+ f"Added MIN partition: {dirs_to_process_min}"
+ )
+
+ # Process the selected partitions
+ all_folders = []
+ for partition_folder in dirs_to_process:
+ # Ensure we have a clean folder path
+ clean_folder = partition_folder.rstrip("/")
+
+ logger.info(f"Scanning files in partition: {clean_folder}")
+ partition_files = list(
+ self.get_folder_info(path_spec, bucket, clean_folder)
+ )
+ all_folders.extend(partition_files)
+
+ if all_folders:
+ # Use the most recent file across all processed partitions
+ latest_file = max(
+ all_folders, key=lambda x: x.modification_time
+ )
+
+ # Get partition information
+ partitions = [f for f in all_folders if f.is_partition]
+
+ # Calculate total size of processed partitions
+ total_size = sum(f.size for f in all_folders)
+
+ # Create ONE BrowsePath per table
+ # The key insight: we need to provide the sample file for schema inference
+ # but the table path should be extracted correctly by extract_table_name_and_path
+ yield BrowsePath(
+ file=latest_file.sample_file, # Sample file for schema inference
+ timestamp=latest_file.modification_time, # Latest timestamp
+ size=total_size, # Size of processed partitions
+ partitions=partitions, # Partition metadata
  )
- max_folder = None
- if folders:
- max_folder = max(folders, key=lambda x: x.modification_time)
- if not max_folder:
+ else:
  logger.warning(
- f"Unable to find any files in the folder {dir}. Skipping..."
+ f"No files found in processed partitions for table {table_name}"
  )
- continue
-
- partitions = list(filter(lambda x: x.is_partition, folders))
- yield BrowsePath(
- file=max_folder.sample_file,
- timestamp=max_folder.modification_time,
- size=max_folder.size,
- partitions=partitions,
- # TODO: Support content type inference for partitions
- )
- except Exception as e:
- # This odd check if being done because boto does not have a proper exception to catch
- # The exception that appears in stacktrace cannot actually be caught without a lot more work
- # https://github.com/boto/boto3/issues/1195
- if "NoSuchBucket" in repr(e):
- logger.debug(f"Got NoSuchBucket exception for {bucket_name}", e)
- self.get_report().report_warning(
- "Missing bucket", f"No bucket found {bucket_name}"
- )
  else:
- raise e
- else:
- logger.debug(
- "No template in the pathspec can't do sampling, fallbacking to do full scan"
- )
- path_spec.sample_files = False
- for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
- s3_path = self.create_s3_path(obj.bucket_name, obj.key)
- logger.debug(f"Path: {s3_path}")
-
- content_type = None
- if self.source_config.use_s3_content_type:
- content_type = s3.Object(obj.bucket_name, obj.key).content_type
-
- yield BrowsePath(
- file=s3_path,
- timestamp=obj.last_modified,
- size=obj.size,
- partitions=[],
- content_type=content_type,
+ logger.warning(
+ f"No partition folders found under table {table_name}"
+ )
+
+ except Exception as e:
+ if "NoSuchBucket" in repr(e):
+ self.get_report().report_warning(
+ "Missing bucket", f"No bucket found {bucket_name}"
  )
+ return
+ logger.error(f"Error in _process_templated_path: {e}")
+ raise e

- def create_s3_path(self, bucket_name: str, key: str) -> str:
- return f"s3://{bucket_name}/{key}"
+ def _process_simple_path(
+ self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
+ ) -> Iterable[BrowsePath]:
+ """
+ Process simple S3 paths without {table} templates to create file-level datasets.
+
+ This method handles straightforward file patterns by:
+ 1. Listing all files matching the pattern
+ 2. Creating one dataset per file
+ 3. No aggregation or grouping is performed
+
+ Use Cases:
+ - Individual file processing: s3://bucket/data/*.csv
+ - Direct file paths: s3://bucket/data/myfile.json
+ - Patterns without table grouping: s3://bucket/logs/*.log
+
+ Args:
+ path_spec: Path specification without {table} template
+ bucket: S3 bucket resource
+ bucket_name: Name of the S3 bucket
+
+ Yields:
+ BrowsePath: One per file, containing individual file metadata
+
+ Example Output:
+ - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
+ - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
+ """
+ assert self.source_config.aws_config is not None, "aws_config not set"
+
+ path_spec.sample_files = False # Disable sampling for simple paths
+
+ # Extract the prefix from the path spec (stops at first wildcard)
+ prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
+
+ # Get s3 resource for content type checking
+ s3 = self.source_config.aws_config.get_s3_resource(
+ self.source_config.verify_ssl
+ )
+
+ # Iterate through all objects in the bucket matching the prefix
+ for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+ s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+ # Get content type if configured
+ content_type = None
+ if self.source_config.use_s3_content_type:
+ content_type = s3.Object(obj.bucket_name, obj.key).content_type
+
+ # Create one BrowsePath per file
+ yield BrowsePath(
+ file=s3_path,
+ timestamp=obj.last_modified,
+ size=obj.size,
+ partitions=[], # No partitions in simple mode
+ content_type=content_type,
+ )

  def local_browser(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
  prefix = self.get_prefix(path_spec.include)
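
Editorial note: the net effect of the templated branch above is that files sharing the path segment matched by {table} are folded into one table-level dataset instead of one dataset per file (the real code delegates this to path_spec.extract_table_name_and_path). A simplified, self-contained illustration of that grouping with hypothetical bucket-relative keys:

    from collections import defaultdict
    from typing import Dict, List

    def group_keys_by_table(include: str, keys: List[str]) -> Dict[str, List[str]]:
        """Group object keys by the path segment that {table} occupies in the include pattern."""
        table_index = include.rstrip("/").split("/").index("{table}")
        tables: Dict[str, List[str]] = defaultdict(list)
        for key in keys:
            parts = key.split("/")
            if len(parts) > table_index:
                tables[parts[table_index]].append(key)
        return dict(tables)

    print(group_keys_by_table(
        "data/*/{table}/*.parquet",
        [
            "data/2024/users/part-0.parquet",
            "data/2024/users/part-1.parquet",
            "data/2024/events/part-0.parquet",
        ],
    ))
    # {'users': [... two keys ...], 'events': [... one key ...]}
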
@@ -1158,8 +1397,13 @@ class S3Source(StatefulIngestionSourceBase):
  )
  table_dict: Dict[str, TableData] = {}
  for browse_path in file_browser:
+ # Normalize URI for pattern matching
+ normalized_file_path = self._normalize_uri_for_pattern_matching(
+ browse_path.file
+ )
+
  if not path_spec.allowed(
- browse_path.file,
+ normalized_file_path,
  ignore_ext=self.is_s3_platform()
  and self.source_config.use_s3_content_type,
  ):
@@ -1235,5 +1479,13 @@ class S3Source(StatefulIngestionSourceBase):
  def is_s3_platform(self):
  return self.source_config.platform == "s3"

+ def strip_s3_prefix(self, s3_uri: str) -> str:
+ """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+ return strip_s3_prefix(s3_uri)
+
+ def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+ """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+ return uri
+
  def get_report(self):
  return self.report
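
Editorial note: the two hook methods added above exist so that adapters for other object stores can reuse the S3 browsing logic while adjusting URI handling. A hypothetical sketch of such an override; the subclass, its gs:// prefix constant, and the mapping rule are assumptions for illustration, not code from this release:

    class GcsStyleAdapter(S3Source):  # hypothetical subclass for a gs://-backed platform
        GCS_PREFIX = "gs://"

        def strip_s3_prefix(self, s3_uri: str) -> str:
            # Strip the platform-specific scheme instead of s3://
            if s3_uri.startswith(self.GCS_PREFIX):
                return s3_uri[len(self.GCS_PREFIX):]
            return s3_uri

        def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
            # Rewrite gs:// URIs into the s3:// form that the configured path_spec patterns expect
            if uri.startswith(self.GCS_PREFIX):
                return "s3://" + uri[len(self.GCS_PREFIX):]
            return uri
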
datahub/ingestion/source/sac/sac.py

@@ -178,7 +178,9 @@ class SACSourceReport(StaleEntityRemovalSourceReport):
  SourceCapability.LINEAGE_COARSE,
  "Enabled by default (only for Live Data Models)",
  )
- @capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion")
+ @capability(
+ SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+ )
  @capability(
  SourceCapability.SCHEMA_METADATA,
  "Enabled by default (only for Import Data Models)",
datahub/ingestion/source/salesforce.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.decorators import (
  )
  from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
  from datahub.ingestion.api.workunit import MetadataWorkUnit
- from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+ from datahub.ingestion.source.common.subtypes import (
+ DatasetSubTypes,
+ SourceCapabilityModifier,
+ )
  from datahub.ingestion.source.state.stale_entity_removal_handler import (
  StaleEntityRemovalHandler,
  StaleEntityRemovalSourceReport,
@@ -532,11 +535,11 @@ class SalesforceApi:
  @capability(
  capability_name=SourceCapability.DATA_PROFILING,
  description="Only table level profiling is supported via `profiling.enabled` config field",
+ subtype_modifier=[SourceCapabilityModifier.TABLE],
  )
  @capability(
  capability_name=SourceCapability.DELETION_DETECTION,
- description="Not supported yet",
- supported=False,
+ description="Enabled by default via stateful ingestion",
  )
  @capability(
  capability_name=SourceCapability.SCHEMA_METADATA,
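
Editorial note: the subtype_modifier argument added above scopes a declared capability to particular entity subtypes in the autogenerated capability summary (see capability_summary.json in the file list). A hedged sketch of the decorator usage on a made-up class; the import paths follow the modules referenced in this diff and may need adjusting, and MyExampleSource is a hypothetical stand-in:

    from datahub.ingestion.api.decorators import capability
    from datahub.ingestion.api.source import SourceCapability
    from datahub.ingestion.source.common.subtypes import SourceCapabilityModifier

    @capability(
        capability_name=SourceCapability.DATA_PROFILING,
        description="Only table level profiling is supported via `profiling.enabled` config field",
        subtype_modifier=[SourceCapabilityModifier.TABLE],
    )
    @capability(
        capability_name=SourceCapability.DELETION_DETECTION,
        description="Enabled by default via stateful ingestion",
    )
    class MyExampleSource:  # hypothetical stand-in for a real source class
        ...
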
datahub/ingestion/source/sigma/sigma.py

@@ -30,6 +30,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.common.subtypes import (
  BIContainerSubTypes,
  DatasetSubTypes,
+ SourceCapabilityModifier,
  )
  from datahub.ingestion.source.sigma.config import (
  PlatformDetail,
@@ -95,7 +96,11 @@ logger = logging.getLogger(__name__)
  @platform_name("Sigma")
  @config_class(SigmaSourceConfig)
  @support_status(SupportStatus.INCUBATING)
- @capability(SourceCapability.CONTAINERS, "Enabled by default")
+ @capability(
+ SourceCapability.CONTAINERS,
+ "Enabled by default",
+ subtype_modifier=[SourceCapabilityModifier.SIGMA_WORKSPACE],
+ )
  @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
  @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default.")
  @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
@@ -105,6 +110,7 @@ logger = logging.getLogger(__name__)
  SourceCapability.OWNERSHIP,
  "Enabled by default, configured using `ingest_owner`",
  )
+ @capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
  class SigmaSource(StatefulIngestionSourceBase, TestableSource):
  """
  This plugin extracts the following: