acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1433 -546
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17736 -17112
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/s3/source.py

@@ -8,7 +8,6 @@ import time
 from datetime import datetime
 from pathlib import PurePath
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple
-from urllib.parse import urlparse

 import smart_open.compression as so_compression
 from more_itertools import peekable
@@ -75,7 +74,6 @@ from datahub.metadata.schema_classes import (
     _Aspect,
 )
 from datahub.telemetry import stats, telemetry
-from datahub.utilities.groupby import groupby_unsorted
 from datahub.utilities.perf_timer import PerfTimer

 if TYPE_CHECKING:
@@ -162,6 +160,15 @@ class Folder:
         )


+@dataclasses.dataclass
+class FolderInfo:
+    objects: List[Any]
+    total_size: int
+    min_time: datetime
+    max_time: datetime
+    latest_obj: Any
+
+
 @dataclasses.dataclass
 class BrowsePath:
     file: str
@@ -675,7 +682,7 @@ class S3Source(StatefulIngestionSourceBase):

         logger.info(f"Extracting table schema from file: {table_data.full_path}")
         browse_path: str = (
-            strip_s3_prefix(table_data.table_path)
+            self.strip_s3_prefix(table_data.table_path)
             if self.is_s3_platform()
             else table_data.table_path.strip("/")
         )
@@ -860,8 +867,18 @@ class S3Source(StatefulIngestionSourceBase):
             bucket_name, folder_split[0], self.source_config.aws_config
         )
         for folder in folders:
+            # Ensure proper path joining - folder already includes trailing slash from list_folders
+            # but we need to handle the case where folder_split[1] might start with a slash
+            remaining_pattern = folder_split[1]
+            if remaining_pattern.startswith("/"):
+                remaining_pattern = remaining_pattern[1:]
+
+            # Ensure folder ends with slash for proper path construction
+            if not folder.endswith("/"):
+                folder = folder + "/"
+
             yield from self.resolve_templated_folders(
-                bucket_name, f"{folder}{folder_split[1]}"
+                bucket_name, f"{folder}{remaining_pattern}"
             )

     def get_dir_to_process(
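The guards added in this hunk exist to avoid doubled or missing "/" separators when a resolved folder is recombined with the remainder of the glob pattern. A standalone sketch of the same normalization, not taken from the package (the helper name is invented):

def join_folder_and_pattern(folder: str, remaining_pattern: str) -> str:
    # Folder listings may or may not return a trailing slash, and the remaining
    # pattern may start with one; normalize both sides before joining.
    if remaining_pattern.startswith("/"):
        remaining_pattern = remaining_pattern[1:]
    if not folder.endswith("/"):
        folder = folder + "/"
    return f"{folder}{remaining_pattern}"

assert join_folder_and_pattern("data/2024", "logs/*.json") == "data/2024/logs/*.json"
assert join_folder_and_pattern("data/2024/", "/logs/*.json") == "data/2024/logs/*.json"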
@@ -932,26 +949,56 @@ class S3Source(StatefulIngestionSourceBase):
         """

         def _is_allowed_path(path_spec_: PathSpec, s3_uri: str) -> bool:
-            allowed = path_spec_.allowed(s3_uri)
+            # Normalize URI for pattern matching
+            normalized_uri = self._normalize_uri_for_pattern_matching(s3_uri)
+
+            allowed = path_spec_.allowed(normalized_uri)
             if not allowed:
                 logger.debug(f"File {s3_uri} not allowed and skipping")
                 self.report.report_file_dropped(s3_uri)
             return allowed

-        s3_objects = (
-            obj
-            for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(
-                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+        # Process objects in a memory-efficient streaming fashion
+        # Instead of loading all objects into memory, we'll accumulate folder data incrementally
+        folder_data: Dict[str, FolderInfo] = {}  # dirname -> FolderInfo
+
+        for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            if not _is_allowed_path(path_spec, s3_path):
+                continue
+
+            # Extract the directory name (folder) from the object key
+            dirname = obj.key.rsplit("/", 1)[0]
+
+            # Initialize folder data if we haven't seen this directory before
+            if dirname not in folder_data:
+                folder_data[dirname] = FolderInfo(
+                    objects=[],
+                    total_size=0,
+                    min_time=obj.last_modified,
+                    max_time=obj.last_modified,
+                    latest_obj=obj,
+                )
+
+            # Update folder statistics incrementally
+            folder_info = folder_data[dirname]
+            folder_info.objects.append(obj)
+            folder_info.total_size += obj.size
+
+            # Track min/max times and latest object
+            if obj.last_modified < folder_info.min_time:
+                folder_info.min_time = obj.last_modified
+            if obj.last_modified > folder_info.max_time:
+                folder_info.max_time = obj.last_modified
+                folder_info.latest_obj = obj
+
+        # Yield folders after processing all objects
+        for _dirname, folder_info in folder_data.items():
+            latest_obj = folder_info.latest_obj
+            max_file_s3_path = self.create_s3_path(
+                latest_obj.bucket_name, latest_obj.key
             )
-        )
-        grouped_s3_objects_by_dirname = groupby_unsorted(
-            s3_objects,
-            key=lambda obj: obj.key.rsplit("/", 1)[0],
-        )
-        for _, group in grouped_s3_objects_by_dirname:
-            max_file = max(group, key=lambda x: x.last_modified)
-            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)

             # If partition_id is None, it means the folder is not a partition
             partition_id = path_spec.get_partition_from_path(max_file_s3_path)
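For context, the hunk above swaps a groupby_unsorted pass over all fetched objects for a single streaming pass that accumulates per-folder statistics. A self-contained sketch of that aggregation pattern, using a stand-in record type rather than the boto3 ObjectSummary:

import dataclasses
from datetime import datetime
from typing import Dict, List

@dataclasses.dataclass
class Obj:  # stand-in for a boto3 ObjectSummary
    key: str
    size: int
    last_modified: datetime

@dataclasses.dataclass
class FolderStats:
    objects: List[Obj]
    total_size: int
    min_time: datetime
    max_time: datetime
    latest_obj: Obj

def aggregate_by_folder(objs: List[Obj]) -> Dict[str, FolderStats]:
    # One pass: group by dirname while keeping running size, min/max times,
    # and the most recently modified object per folder.
    folders: Dict[str, FolderStats] = {}
    for obj in objs:
        dirname = obj.key.rsplit("/", 1)[0]
        stats = folders.get(dirname)
        if stats is None:
            stats = folders[dirname] = FolderStats(
                [], 0, obj.last_modified, obj.last_modified, obj
            )
        stats.objects.append(obj)
        stats.total_size += obj.size
        if obj.last_modified < stats.min_time:
            stats.min_time = obj.last_modified
        if obj.last_modified > stats.max_time:
            stats.max_time = obj.last_modified
            stats.latest_obj = obj
    return folders

stats = aggregate_by_folder(
    [
        Obj("t1/part=1/a.parquet", 10, datetime(2024, 1, 1)),
        Obj("t1/part=1/b.parquet", 20, datetime(2024, 2, 1)),
    ]
)["t1/part=1"]
assert stats.total_size == 30 and stats.latest_obj.key == "t1/part=1/b.parquet"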
@@ -959,37 +1006,100 @@ class S3Source(StatefulIngestionSourceBase):
             yield Folder(
                 partition_id=partition_id,
                 is_partition=bool(partition_id),
-                creation_time=
-                modification_time=
+                creation_time=folder_info.min_time,
+                modification_time=folder_info.max_time,
                 sample_file=max_file_s3_path,
-                size=
+                size=folder_info.total_size,
             )

+    def create_s3_path(self, bucket_name: str, key: str) -> str:
+        return f"s3://{bucket_name}/{key}"
+
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
+        """
+        Main entry point for browsing S3 objects and creating table-level datasets.
+
+        This method determines whether to use templated processing (for paths with {table})
+        or simple file-by-file processing (for paths without templates).
+
+        Args:
+            path_spec: Configuration specifying the S3 path pattern to scan
+            sample_size: Number of files to sample (used in simple processing)
+
+        Returns:
+            Iterator of BrowsePath objects representing datasets to be created
+
+        Examples:
+            - Templated: s3://bucket/data/*/{table}/** -> Groups files by table
+            - Simple: s3://bucket/data/*.csv -> Processes individual files
+        """
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
+
         s3 = self.source_config.aws_config.get_s3_resource(
             self.source_config.verify_ssl
         )
         bucket_name = get_bucket_name(path_spec.include)
-        logger.debug(f"Scanning bucket: {bucket_name}")
         bucket = s3.Bucket(bucket_name)
-
-        logger.debug(f"Scanning
+
+        logger.debug(f"Scanning bucket: {bucket_name}")
+        logger.info(f"Processing path spec: {path_spec.include}")
+
+        # Check if we have {table} template in the path
+        has_table_template = "{table}" in path_spec.include
+
+        logger.info(f"Has table template: {has_table_template}")
+
+        if has_table_template:
+            logger.info("Using templated path processing")
+            # Always use templated processing when {table} is present
+            # This groups files under table-level datasets
+            yield from self._process_templated_path(path_spec, bucket, bucket_name)
+        else:
+            logger.info("Using simple path processing")
+            # Only use simple processing for non-templated paths
+            # This creates individual file-level datasets
+            yield from self._process_simple_path(path_spec, bucket, bucket_name)
+
+    def _process_templated_path(
+        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
+    ) -> Iterable[BrowsePath]:
+        """
+        Process S3 paths containing {table} templates to create table-level datasets.
+
+        This method handles complex path patterns with wildcards and templates by:
+        1. Replacing template placeholders with stars (except {table})
+        2. Resolving wildcards in the path up to the {table} marker
+        3. Finding all potential table folders under each resolved path
+        4. Applying configurable partition traversal strategy (ALL, MAX, MIN_MAX)
+        5. Aggregating files from selected partitions under each table
+        6. Creating one dataset per table (not per file)
+
+        Args:
+            path_spec: Path specification with {table} template
+            bucket: S3 bucket resource
+            bucket_name: Name of the S3 bucket
+
+        Yields:
+            BrowsePath: One per table (not per file), containing aggregated metadata
+        """
+        # Find the part before {table}
+        table_marker = "{table}"
+        if table_marker not in path_spec.include:
+            logger.info("No {table} marker found in path")
+            return
+
+        # STEP 1: Replace template placeholders with stars (except {table}) to enable folder resolution
+        # This is the crucial missing logic from the original implementation
         matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE)
         matches_list = list(matches)
-
-
-        #
-        # For example:
-        # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*"
-        # We only keep the last template as a marker to know the point util we need to resolve path.
-        # After the marker we can safely get sample files for sampling because it is not used in the
-        # table name, so we don't need all the files.
-        # This speed up processing but we won't be able to get a precise modification date/size/number of files.
+
+        if matches_list:
+            # Replace all templates with stars except keep {table} as the marker
             max_start: int = -1
             include: str = path_spec.include
             max_match: str = ""
+
             for match in matches_list:
                 pos = include.find(match.group())
                 if pos > max_start:
@@ -1001,120 +1111,249 @@ class S3Source(StatefulIngestionSourceBase):
                 if max_match == "{table}":
                     break

-
-
-
-
-
-
-
-
-
-
-
+            logger.info(f"Template replacement: {path_spec.include} -> {include}")
+        else:
+            include = path_spec.include
+
+        # Split the path at {table} to get the prefix that needs wildcard resolution
+        prefix_before_table = include.split(table_marker)[0]
+        # Remove the s3:// and bucket name to get the relative path
+        relative_path = get_bucket_relative_path(prefix_before_table)
+
+        logger.info(f"Prefix before table: {prefix_before_table}")
+        logger.info(f"Relative path for resolution: {relative_path}")
+
+        try:
+            # STEP 2: Resolve ALL wildcards in the path up to {table}
+            # This converts patterns like "data/*/logs/" to actual paths like ["data/2023/logs/", "data/2024/logs/"]
+            table_index = include.find(table_marker)
+            folder_prefix = get_bucket_relative_path(include[:table_index])
+
+            resolved_prefixes = list(
+                self.resolve_templated_folders(bucket_name, folder_prefix)
+            )
+            logger.info(f"Resolved prefixes: {resolved_prefixes}")
+
+            # STEP 3: Process each resolved prefix to find table folders
+            for resolved_prefix in resolved_prefixes:
+                logger.info(f"Processing resolved prefix: {resolved_prefix}")
+
+                # Get all folders that could be tables under this resolved prefix
+                # These are the actual table names (e.g., "users", "events", "logs")
+                table_folders = list(
+                    list_folders(
+                        bucket_name, resolved_prefix, self.source_config.aws_config
+                    )
+                )
+                logger.debug(
+                    f"Found table folders under {resolved_prefix}: {table_folders}"
+                )
+
+                # STEP 4: Process each table folder to create a table-level dataset
+                for table_folder in table_folders:
+                    # Create the full S3 path for this table
+                    table_s3_path = self.create_s3_path(
+                        bucket_name, table_folder.rstrip("/")
+                    )
+                    logger.info(
+                        f"Processing table folder: {table_folder} -> {table_s3_path}"
+                    )
+
+                    # Extract table name using the ORIGINAL path spec pattern matching (not the modified one)
+                    # This uses the compiled regex pattern to extract the table name from the full path
+                    table_name, table_path = path_spec.extract_table_name_and_path(
+                        table_s3_path
+                    )
+
+                    # Apply table name filtering if configured
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(f"Table '{table_name}' not allowed and skipping")
+                        continue
+
+                    # STEP 5: Handle partition traversal based on configuration
+                    # Get all partition folders first
+                    all_partition_folders = list(
+                        list_folders(
+                            bucket_name, table_folder, self.source_config.aws_config
                         )
-
-
-
-
-                    self.report.report_file_dropped(table_path)
-                    continue
+                    )
+                    logger.info(
+                        f"Found {len(all_partition_folders)} partition folders under table {table_name} using method {path_spec.traversal_method}"
+                    )

+                    if all_partition_folders:
+                        # Apply the same traversal logic as the original code
                         dirs_to_process = []
-
+
                         if path_spec.traversal_method == FolderTraversalMethod.ALL:
-
+                            # Process ALL partitions (original behavior)
+                            dirs_to_process = all_partition_folders
+                            logger.debug(
+                                f"Processing ALL {len(all_partition_folders)} partitions"
+                            )
+
                         else:
+                            # Use the original get_dir_to_process logic for MIN/MAX
+                            protocol = "s3://"  # Default protocol for S3
+
                             if (
                                 path_spec.traversal_method
                                 == FolderTraversalMethod.MIN_MAX
                                 or path_spec.traversal_method
                                 == FolderTraversalMethod.MAX
                             ):
-
-                                path_spec.include
-                            )
+                                # Get MAX partition using original logic
                                 dirs_to_process_max = self.get_dir_to_process(
                                     bucket_name=bucket_name,
-                                    folder=
+                                    folder=table_folder + "/",
                                     path_spec=path_spec,
                                     protocol=protocol,
+                                    min=False,
                                 )
-
+                                if dirs_to_process_max:
+                                    # Convert full S3 paths back to relative paths for processing
+                                    dirs_to_process.extend(
+                                        [
+                                            d.replace(f"{protocol}{bucket_name}/", "")
+                                            for d in dirs_to_process_max
+                                        ]
+                                    )
+                                    logger.debug(
+                                        f"Added MAX partition: {dirs_to_process_max}"
+                                    )

                             if (
                                 path_spec.traversal_method
                                 == FolderTraversalMethod.MIN_MAX
                             ):
+                                # Get MIN partition using original logic
                                 dirs_to_process_min = self.get_dir_to_process(
                                     bucket_name=bucket_name,
-                                    folder=
+                                    folder=table_folder + "/",
                                     path_spec=path_spec,
                                     protocol=protocol,
                                     min=True,
                                 )
-
-
-
-
-
-
-
-
-
-
+                                if dirs_to_process_min:
+                                    # Convert full S3 paths back to relative paths for processing
+                                    dirs_to_process.extend(
+                                        [
+                                            d.replace(f"{protocol}{bucket_name}/", "")
+                                            for d in dirs_to_process_min
+                                        ]
+                                    )
+                                    logger.debug(
+                                        f"Added MIN partition: {dirs_to_process_min}"
+                                    )
+
+                        # Process the selected partitions
+                        all_folders = []
+                        for partition_folder in dirs_to_process:
+                            # Ensure we have a clean folder path
+                            clean_folder = partition_folder.rstrip("/")
+
+                            logger.info(f"Scanning files in partition: {clean_folder}")
+                            partition_files = list(
+                                self.get_folder_info(path_spec, bucket, clean_folder)
+                            )
+                            all_folders.extend(partition_files)
+
+                        if all_folders:
+                            # Use the most recent file across all processed partitions
+                            latest_file = max(
+                                all_folders, key=lambda x: x.modification_time
+                            )
+
+                            # Get partition information
+                            partitions = [f for f in all_folders if f.is_partition]
+
+                            # Calculate total size of processed partitions
+                            total_size = sum(f.size for f in all_folders)
+
+                            # Create ONE BrowsePath per table
+                            # The key insight: we need to provide the sample file for schema inference
+                            # but the table path should be extracted correctly by extract_table_name_and_path
+                            yield BrowsePath(
+                                file=latest_file.sample_file,  # Sample file for schema inference
+                                timestamp=latest_file.modification_time,  # Latest timestamp
+                                size=total_size,  # Size of processed partitions
+                                partitions=partitions,  # Partition metadata
                             )
-
-                    if folders:
-                        max_folder = max(folders, key=lambda x: x.modification_time)
-                        if not max_folder:
+                        else:
                             logger.warning(
-                                f"
+                                f"No files found in processed partitions for table {table_name}"
                             )
-                            continue
-
-                        partitions = list(filter(lambda x: x.is_partition, folders))
-                        yield BrowsePath(
-                            file=max_folder.sample_file,
-                            timestamp=max_folder.modification_time,
-                            size=max_folder.size,
-                            partitions=partitions,
-                            # TODO: Support content type inference for partitions
-                        )
-            except Exception as e:
-                # This odd check if being done because boto does not have a proper exception to catch
-                # The exception that appears in stacktrace cannot actually be caught without a lot more work
-                # https://github.com/boto/boto3/issues/1195
-                if "NoSuchBucket" in repr(e):
-                    logger.debug(f"Got NoSuchBucket exception for {bucket_name}", e)
-                    self.get_report().report_warning(
-                        "Missing bucket", f"No bucket found {bucket_name}"
-                    )
                     else:
-
-
-
-
-
-
-
-
-            logger.debug(f"Path: {s3_path}")
-
-            content_type = None
-            if self.source_config.use_s3_content_type:
-                content_type = s3.Object(obj.bucket_name, obj.key).content_type
-
-            yield BrowsePath(
-                file=s3_path,
-                timestamp=obj.last_modified,
-                size=obj.size,
-                partitions=[],
-                content_type=content_type,
+                        logger.warning(
+                            f"No partition folders found under table {table_name}"
+                        )
+
+        except Exception as e:
+            if "NoSuchBucket" in repr(e):
+                self.get_report().report_warning(
+                    "Missing bucket", f"No bucket found {bucket_name}"
                 )
+                return
+            logger.error(f"Error in _process_templated_path: {e}")
+            raise e

-    def
-
+    def _process_simple_path(
+        self, path_spec: PathSpec, bucket: "Bucket", bucket_name: str
+    ) -> Iterable[BrowsePath]:
+        """
+        Process simple S3 paths without {table} templates to create file-level datasets.
+
+        This method handles straightforward file patterns by:
+        1. Listing all files matching the pattern
+        2. Creating one dataset per file
+        3. No aggregation or grouping is performed
+
+        Use Cases:
+        - Individual file processing: s3://bucket/data/*.csv
+        - Direct file paths: s3://bucket/data/myfile.json
+        - Patterns without table grouping: s3://bucket/logs/*.log
+
+        Args:
+            path_spec: Path specification without {table} template
+            bucket: S3 bucket resource
+            bucket_name: Name of the S3 bucket
+
+        Yields:
+            BrowsePath: One per file, containing individual file metadata
+
+        Example Output:
+        - BrowsePath(file="data/file1.csv", size=1000, partitions=[])
+        - BrowsePath(file="data/file2.csv", size=2000, partitions=[])
+        """
+        assert self.source_config.aws_config is not None, "aws_config not set"
+
+        path_spec.sample_files = False  # Disable sampling for simple paths
+
+        # Extract the prefix from the path spec (stops at first wildcard)
+        prefix = self.get_prefix(get_bucket_relative_path(path_spec.include))
+
+        # Get s3 resource for content type checking
+        s3 = self.source_config.aws_config.get_s3_resource(
+            self.source_config.verify_ssl
+        )
+
+        # Iterate through all objects in the bucket matching the prefix
+        for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE):
+            s3_path = self.create_s3_path(obj.bucket_name, obj.key)
+
+            # Get content type if configured
+            content_type = None
+            if self.source_config.use_s3_content_type:
+                content_type = s3.Object(obj.bucket_name, obj.key).content_type
+
+            # Create one BrowsePath per file
+            yield BrowsePath(
+                file=s3_path,
+                timestamp=obj.last_modified,
+                size=obj.size,
+                partitions=[],  # No partitions in simple mode
+                content_type=content_type,
+            )

     def local_browser(self, path_spec: PathSpec) -> Iterable[BrowsePath]:
         prefix = self.get_prefix(path_spec.include)
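A rough illustration of how the templated branch above interprets an include pattern: the part before the {table} marker is resolved to concrete prefixes, each folder directly under a resolved prefix becomes a table-level dataset, and the remainder of the pattern is matched inside that folder. The helper below is illustrative only, not package code:

def split_at_table_marker(include: str) -> tuple:
    # Separate the part that gets resolved to real folders from the part that
    # is matched inside each table folder.
    table_marker = "{table}"
    if table_marker not in include:
        raise ValueError("include pattern has no {table} marker")
    prefix, _, suffix = include.partition(table_marker)
    return prefix, suffix

prefix, suffix = split_at_table_marker("s3://my-bucket/data/*/{table}/*.parquet")
assert prefix == "s3://my-bucket/data/*/"  # wildcards here are expanded to folders
assert suffix == "/*.parquet"  # matched within each table folder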
@@ -1158,8 +1397,13 @@ class S3Source(StatefulIngestionSourceBase):
         )
         table_dict: Dict[str, TableData] = {}
         for browse_path in file_browser:
+            # Normalize URI for pattern matching
+            normalized_file_path = self._normalize_uri_for_pattern_matching(
+                browse_path.file
+            )
+
             if not path_spec.allowed(
-                browse_path.file,
+                normalized_file_path,
                 ignore_ext=self.is_s3_platform()
                 and self.source_config.use_s3_content_type,
             ):
@@ -1235,5 +1479,13 @@ class S3Source(StatefulIngestionSourceBase):
     def is_s3_platform(self):
         return self.source_config.platform == "s3"

+    def strip_s3_prefix(self, s3_uri: str) -> str:
+        """Strip S3 prefix from URI. Can be overridden by adapters for other platforms."""
+        return strip_s3_prefix(s3_uri)
+
+    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
+        """Normalize URI for pattern matching. Can be overridden by adapters for other platforms."""
+        return uri
+
     def get_report(self):
         return self.report
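strip_s3_prefix and _normalize_uri_for_pattern_matching are added as instance methods so that, per their docstrings, adapters for other platforms can override them. A generic sketch of that override hook with invented class names (the gs:// rewrite is purely hypothetical):

class BaseBrowser:
    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
        # Default behaviour: URIs already match the configured path patterns.
        return uri

class OtherStoreAdapter(BaseBrowser):
    def _normalize_uri_for_pattern_matching(self, uri: str) -> str:
        # An adapter could rewrite its scheme before pattern matching.
        return uri.replace("gs://", "s3://", 1)

assert (
    OtherStoreAdapter()._normalize_uri_for_pattern_matching("gs://bucket/a.csv")
    == "s3://bucket/a.csv"
)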
datahub/ingestion/source/sac/sac.py

@@ -178,7 +178,9 @@ class SACSourceReport(StaleEntityRemovalSourceReport):
     SourceCapability.LINEAGE_COARSE,
     "Enabled by default (only for Live Data Models)",
 )
-@capability(
+@capability(
+    SourceCapability.DELETION_DETECTION, "Enabled by default via stateful ingestion"
+)
 @capability(
     SourceCapability.SCHEMA_METADATA,
     "Enabled by default (only for Import Data Models)",
datahub/ingestion/source/salesforce.py

@@ -33,7 +33,10 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.common.subtypes import (
+    DatasetSubTypes,
+    SourceCapabilityModifier,
+)
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -532,11 +535,11 @@ class SalesforceApi:
 @capability(
     capability_name=SourceCapability.DATA_PROFILING,
     description="Only table level profiling is supported via `profiling.enabled` config field",
+    subtype_modifier=[SourceCapabilityModifier.TABLE],
 )
 @capability(
     capability_name=SourceCapability.DELETION_DETECTION,
-    description="
-    supported=False,
+    description="Enabled by default via stateful ingestion",
 )
 @capability(
     capability_name=SourceCapability.SCHEMA_METADATA,
datahub/ingestion/source/sigma/sigma.py

@@ -30,6 +30,7 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import (
     BIContainerSubTypes,
     DatasetSubTypes,
+    SourceCapabilityModifier,
 )
 from datahub.ingestion.source.sigma.config import (
     PlatformDetail,
@@ -95,7 +96,11 @@ logger = logging.getLogger(__name__)
 @platform_name("Sigma")
 @config_class(SigmaSourceConfig)
 @support_status(SupportStatus.INCUBATING)
-@capability(
+@capability(
+    SourceCapability.CONTAINERS,
+    "Enabled by default",
+    subtype_modifier=[SourceCapabilityModifier.SIGMA_WORKSPACE],
+)
 @capability(SourceCapability.DESCRIPTIONS, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default.")
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
@@ -105,6 +110,7 @@ logger = logging.getLogger(__name__)
     SourceCapability.OWNERSHIP,
     "Enabled by default, configured using `ingest_owner`",
 )
+@capability(SourceCapability.TEST_CONNECTION, "Enabled by default")
 class SigmaSource(StatefulIngestionSourceBase, TestableSource):
     """
     This plugin extracts the following: