acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub has been flagged as potentially problematic by the registry.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/METADATA +2511 -2484
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/RECORD +223 -189
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/configuration/pydantic_migration_helpers.py +7 -5
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1522 -569
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17758 -17097
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/sdk/search_filters.py +95 -27
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +56 -14
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1.dist-info}/top_level.txt +0 -0
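The headline change visible in this diff is Lake Formation tag extraction in the Glue source: glue.py gains an `extract_lakeformation_tags` config flag (default `False`), backed by the new `external_entities`, `lake_formation_external_entites`, and `aws/tag_entities` modules listed above. As a rough, hedged sketch of how the new flag might be switched on programmatically — the region, credentials, and sink address below are placeholder assumptions, not values taken from this diff:

# Illustrative sketch only: enables the extract_lakeformation_tags flag
# added to GlueSourceConfig in 1.2.0.1. Region and sink address are
# placeholder assumptions, not values from this release.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "glue",
            "config": {
                "aws_region": "us-east-1",  # placeholder
                "extract_lakeformation_tags": True,  # new in this release, default False
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # placeholder
        },
    }
)
pipeline.run()
pipeline.raise_from_status()

The reconstructed glue.py diff follows.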
--- datahub/ingestion/source/aws/glue.py (1.1.1rc4)
+++ datahub/ingestion/source/aws/glue.py (1.2.0.1)
@@ -25,6 +25,12 @@ from pydantic import validator
 from pydantic.fields import Field
 
 from datahub.api.entities.dataset.dataset import Dataset
+from datahub.api.entities.external.external_entities import (
+    PlatformResourceRepository,
+)
+from datahub.api.entities.external.lake_formation_external_entites import (
+    LakeFormationTag,
+)
 from datahub.configuration.common import AllowDenyPattern
 from datahub.configuration.source_common import DatasetSourceConfigMixin
 from datahub.emitter import mce_builder
@@ -62,6 +68,10 @@ from datahub.ingestion.source.aws.s3_util import (
     make_s3_urn,
     make_s3_urn_for_lineage,
 )
+from datahub.ingestion.source.aws.tag_entities import (
+    LakeFormationTagPlatformResource,
+    LakeFormationTagPlatformResourceId,
+)
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -114,6 +124,7 @@ from datahub.metadata.schema_classes import (
 from datahub.utilities.delta import delta_type_to_hive_type
 from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column
 from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.urns.error import InvalidUrnError
 
 logger = logging.getLogger(__name__)
 
@@ -168,6 +179,12 @@ class GlueSourceConfig(
         default=False,
         description="If an S3 Objects Tags should be created for the Tables ingested by Glue.",
     )
+
+    extract_lakeformation_tags: Optional[bool] = Field(
+        default=False,
+        description="When True, extracts Lake Formation tags directly assigned to Glue tables/databases. Note: Tags inherited from databases or other parent resources are excluded.",
+    )
+
     profiling: GlueProfilingConfig = Field(
         default_factory=GlueProfilingConfig,
         description="Configs to ingest data profiles from glue table",
@@ -176,6 +193,7 @@ class GlueSourceConfig(
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
         default=None, description=""
     )
+
     extract_delta_schema_from_parameters: Optional[bool] = Field(
         default=False,
         description="If enabled, delta schemas can be alternatively fetched from table parameters.",
@@ -199,6 +217,10 @@ class GlueSourceConfig(
     def s3_client(self):
         return self.get_s3_client()
 
+    @property
+    def lakeformation_client(self):
+        return self.get_lakeformation_client()
+
     @validator("glue_s3_lineage_direction")
     def check_direction(cls, v: str) -> str:
         if v.lower() not in ["upstream", "downstream"]:
@@ -247,7 +269,7 @@ class GlueSourceReport(StaleEntityRemovalSourceReport):
 @capability(SourceCapability.DOMAINS, "Supported via the `domain` config field")
 @capability(
     SourceCapability.DELETION_DETECTION,
-    "Enabled by default
+    "Enabled by default via stateful ingestion.",
 )
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @capability(
@@ -311,6 +333,8 @@ class GlueSource(StatefulIngestionSourceBase):
     source_config: GlueSourceConfig
     report: GlueSourceReport
 
+    lf_tag_cache: Dict[str, Dict[str, List[str]]] = {}
+
     def __init__(self, config: GlueSourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
         self.ctx = ctx
@@ -320,9 +344,114 @@ class GlueSource(StatefulIngestionSourceBase):
         self.report.catalog_id = self.source_config.catalog_id
         self.glue_client = config.glue_client
         self.s3_client = config.s3_client
+        # Initialize Lake Formation client
+        self.lf_client = config.lakeformation_client
         self.extract_transforms = config.extract_transforms
         self.env = config.env
 
+        self.platform_resource_repository: Optional[PlatformResourceRepository] = None
+        if self.ctx.graph:
+            self.platform_resource_repository = PlatformResourceRepository(
+                self.ctx.graph
+            )
+
+    def get_database_lf_tags(
+        self,
+        catalog_id: str,
+        database_name: str,
+    ) -> List[LakeFormationTag]:
+        """Get all LF tags for a specific table."""
+        try:
+            # Get LF tags for the specified table
+            response = self.lf_client.get_resource_lf_tags(
+                CatalogId=catalog_id,
+                Resource={
+                    "Database": {
+                        "CatalogId": catalog_id,
+                        "Name": database_name,
+                    }
+                },
+                ShowAssignedLFTags=True,
+            )
+
+            if response:
+                logger.info(f"LF tags for database {database_name}: {response}")
+                # Extract and return the LF tags
+                lf_tags = response.get("LFTagOnDatabase", [])
+
+                tags = []
+                for lf_tag in lf_tags:
+                    catalog_id = lf_tag.get("CatalogId")
+                    tag_key = lf_tag.get("TagKey")
+                    for tag_value in lf_tag.get("TagValues", []):
+                        t = LakeFormationTag(
+                            key=tag_key,
+                            value=tag_value,
+                            catalog_id=catalog_id,
+                        )
+                        tags.append(t)
+                return tags
+
+        except Exception as e:
+            print(
+                f"Error getting LF tags for table {catalog_id}.{database_name}: {str(e)}"
+            )
+            return []
+
+    def get_table_lf_tags(
+        self,
+        catalog_id: str,
+        database_name: str,
+        table_name: str,
+    ) -> List[LakeFormationTag]:
+        """Get all LF tags for a specific table."""
+        try:
+            # Get LF tags for the specified table
+            response = self.lf_client.get_resource_lf_tags(
+                CatalogId=catalog_id,
+                Resource={
+                    "Table": {
+                        "CatalogId": catalog_id,
+                        "DatabaseName": database_name,
+                        "Name": table_name,
+                    },
+                },
+                ShowAssignedLFTags=True,
+            )
+
+            # Extract and return the LF tags
+            lf_tags = response.get("LFTagsOnTable", [])
+
+            tags = []
+            for lf_tag in lf_tags:
+                catalog_id = lf_tag.get("CatalogId")
+                tag_key = lf_tag.get("TagKey")
+                for tag_value in lf_tag.get("TagValues", []):
+                    t = LakeFormationTag(
+                        key=tag_key,
+                        value=tag_value,
+                        catalog_id=catalog_id,
+                    )
+                    tags.append(t)
+            return tags
+
+        except Exception:
+            return []
+
+    def get_all_lf_tags(self) -> List:
+        # 1. Get all LF-Tags in your account (metadata only)
+        response = self.lf_client.list_lf_tags(
+            MaxResults=50  # Adjust as needed
+        )
+        all_lf_tags = response["LFTags"]
+        # Continue pagination if necessary
+        while "NextToken" in response:
+            response = self.lf_client.list_lf_tags(
+                NextToken=response["NextToken"], MaxResults=50
+            )
+            all_lf_tags.extend(response["LFTags"])
+        return all_lf_tags
+
     def get_glue_arn(
         self, account_id: str, database: str, table: Optional[str] = None
     ) -> str:
@@ -869,7 +998,7 @@ class GlueSource(StatefulIngestionSourceBase):
         table_stats: dict,
         column_stats: dict,
         partition_spec: Optional[str] = None,
-    ) -> MetadataChangeProposalWrapper:
+    ) -> Optional[MetadataChangeProposalWrapper]:
         assert self.source_config.profiling
 
         # instantiate profile class
@@ -936,6 +1065,14 @@ class GlueSource(StatefulIngestionSourceBase):
 
             dataset_profile.fieldProfiles.append(column_profile)
 
+        # if no stats are available, skip ingestion
+        if (
+            not dataset_profile.fieldProfiles
+            and dataset_profile.rowCount is None
+            and dataset_profile.columnCount is None
+        ):
+            return None
+
         if partition_spec:
             # inject partition level stats
             dataset_profile.partitionSpec = PartitionSpecClass(
@@ -990,18 +1127,20 @@ class GlueSource(StatefulIngestionSourceBase):
                 if self.source_config.profiling.partition_patterns.allowed(
                     partition_spec
                 ):
-
+                    profile_mcp = self._create_profile_mcp(
                         mce, table_stats, column_stats, partition_spec
-                    )
+                    )
+                    if profile_mcp:
+                        yield profile_mcp.as_workunit()
                 else:
                     continue
         else:
             # ingest data profile without partition
             table_stats = response["Table"]["Parameters"]
             column_stats = response["Table"]["StorageDescriptor"]["Columns"]
-
-
-
+            profile_mcp = self._create_profile_mcp(mce, table_stats, column_stats)
+            if profile_mcp:
+                yield profile_mcp.as_workunit()
 
     def gen_database_key(self, database: str) -> DatabaseKey:
         return DatabaseKey(
@@ -1012,9 +1151,66 @@ class GlueSource(StatefulIngestionSourceBase):
             backcompat_env_as_instance=True,
         )
 
+    def gen_platform_resource(
+        self, tag: LakeFormationTag
+    ) -> Iterable[MetadataWorkUnit]:
+        if self.ctx.graph and self.platform_resource_repository:
+            platform_resource_id = LakeFormationTagPlatformResourceId.from_tag(
+                platform_instance=self.source_config.platform_instance,
+                platform_resource_repository=self.platform_resource_repository,
+                catalog=tag.catalog,
+                tag=tag,
+            )
+            logger.info(f"Created platform resource {platform_resource_id}")
+
+            lf_tag = LakeFormationTagPlatformResource.get_from_datahub(
+                platform_resource_id, self.platform_resource_repository, False
+            )
+            if (
+                tag.to_datahub_tag_urn().urn()
+                not in lf_tag.datahub_linked_resources().urns
+            ):
+                try:
+                    lf_tag.datahub_linked_resources().add(
+                        tag.to_datahub_tag_urn().urn()
+                    )
+                    platform_resource = lf_tag.as_platform_resource()
+                    for mcp in platform_resource.to_mcps():
+                        yield MetadataWorkUnit(
+                            id=f"platform_resource-{platform_resource.id}",
+                            mcp=mcp,
+                        )
+                except Exception as e:
+                    logger.warning(
+                        f"Failed to create platform resource for tag {tag}: {e}",
+                        exc_info=True,
+                    )
+                    self.report.report_warning(
+                        context="Failed to create platform resource",
+                        message=f"Failed to create platform resource for Tag: {tag}",
+                    )
+
     def gen_database_containers(
         self, database: Mapping[str, Any]
     ) -> Iterable[MetadataWorkUnit]:
+        container_tags: Optional[List] = None
+        if self.source_config.extract_lakeformation_tags:
+            try:
+                tags = self.get_database_lf_tags(
+                    catalog_id=database["CatalogId"], database_name=database["Name"]
+                )
+                container_tags = []
+                for tag in tags:
+                    try:
+                        container_tags.append(tag.to_datahub_tag_urn().name)
+                        yield from self.gen_platform_resource(tag)
+                    except InvalidUrnError:
+                        continue
+            except Exception:
+                self.report_warning(
+                    reason="Failed to extract Lake Formation tags for database",
+                    key=database["Name"],
+                )
         domain_urn = self._gen_domain_urn(database["Name"])
         database_container_key = self.gen_database_key(database["Name"])
         parameters = database.get("Parameters", {})
@@ -1032,6 +1228,7 @@ class GlueSource(StatefulIngestionSourceBase):
             qualified_name=self.get_glue_arn(
                 account_id=database["CatalogId"], database=database["Name"]
             ),
+            tags=container_tags,
             extra_properties=parameters,
         )
 
@@ -1106,9 +1303,8 @@ class GlueSource(StatefulIngestionSourceBase):
             platform_instance=self.source_config.platform_instance,
         )
 
-
-
-
+        yield from self._extract_record(dataset_urn, table, full_table_name)
+        # generate a Dataset snapshot
         # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
         # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
         yield MetadataChangeProposalWrapper(
@@ -1124,19 +1320,6 @@ class GlueSource(StatefulIngestionSourceBase):
             dataset_urn=dataset_urn, db_name=database_name
         )
 
-        wu = self.get_lineage_if_enabled(mce)
-        if wu:
-            yield wu
-
-        try:
-            yield from self.get_profile_if_enabled(mce, database_name, table_name)
-        except KeyError as e:
-            self.report.report_failure(
-                message="Failed to extract profile for table",
-                context=f"Table: {dataset_urn}",
-                exc=e,
-            )
-
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
         flow_names: Dict[str, str] = {}
@@ -1191,159 +1374,201 @@ class GlueSource(StatefulIngestionSourceBase):
         for dataset_id, dataset_mce in zip(new_dataset_ids, new_dataset_mces):
             yield MetadataWorkUnit(id=dataset_id, mce=dataset_mce)
 
-    # flake8: noqa: C901
     def _extract_record(
         self, dataset_urn: str, table: Dict, table_name: str
-    ) ->
+    ) -> Iterable[MetadataWorkUnit]:
+        """Extract and yield metadata work units for a Glue table."""
         logger.debug(
             f"extract record from table={table_name} for dataset={dataset_urn}"
         )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                database=table["DatabaseName"],
-                table=table["Name"],
-            ),
-        )
+        # Create the main dataset snapshot
+        dataset_snapshot = DatasetSnapshot(
+            urn=dataset_urn,
+            aspects=[
+                Status(removed=False),
+                self._get_dataset_properties(table),
+            ],
+        )
+
+        # Add schema metadata if available
+        schema_metadata = self._get_schema_metadata(table, table_name, dataset_urn)
+        if schema_metadata:
+            dataset_snapshot.aspects.append(schema_metadata)
+
+        # Add platform instance
+        dataset_snapshot.aspects.append(self._get_data_platform_instance())
 
-
-
-
-        if
-
-
-
+        # Add ownership if enabled
+        if self.extract_owners:
+            ownership = GlueSource._get_ownership(table.get("Owner"))
+            if ownership:
+                dataset_snapshot.aspects.append(ownership)
+
+        # Add S3 tags if enabled
+        s3_tags = self._get_s3_tags(table, dataset_urn)
+        if s3_tags:
+            dataset_snapshot.aspects.append(s3_tags)
+
+        # Add Lake Formation tags if enabled
+        if self.source_config.extract_lakeformation_tags:
+            tags = self.get_table_lf_tags(
+                catalog_id=table["CatalogId"],
+                database_name=table["DatabaseName"],
+                table_name=table["Name"],
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                tags_to_add.extend(
-                    [
-                        make_tag_urn(f"""{tag["Key"]}:{tag["Value"]}""")
-                        for tag in tag_set
-                    ]
-                )
-            else:
-                # Unlike bucket tags, if an object does not have tags, it will just return an empty array
-                # as opposed to an exception.
-                logger.warning(
-                    f"No tags found for bucket={bucket_name} key={key_prefix}"
-                )
-            if len(tags_to_add) == 0:
-                return None
-            if self.ctx.graph is not None:
-                logger.debug(
-                    "Connected to DatahubApi, grabbing current tags to maintain."
-                )
-                current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect(
-                    entity_urn=dataset_urn,
-                    aspect_type=GlobalTagsClass,
-                )
-                if current_tags:
-                    tags_to_add.extend(
-                        [current_tag.tag for current_tag in current_tags.tags]
-                    )
-                else:
-                    logger.warning(
-                        "Could not connect to DatahubApi. No current tags to maintain"
-                    )
-            # Remove duplicate tags
-            tags_to_add = sorted(list(set(tags_to_add)))
-            new_tags = GlobalTagsClass(
-                tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add]
+
+            global_tags = self._get_lake_formation_tags(tags)
+            if global_tags:
+                dataset_snapshot.aspects.append(global_tags)
+            # Generate platform resources for LF tags
+            for tag in tags:
+                yield from self.gen_platform_resource(tag)
+
+        # Create and yield the main metadata work unit
+        metadata_record = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        yield MetadataWorkUnit(table_name, mce=metadata_record)
+
+        # Add lineage if enabled
+        lineage_wu = self.get_lineage_if_enabled(metadata_record)
+        if lineage_wu:
+            yield lineage_wu
+
+        # Add profile if enabled
+        try:
+            yield from self.get_profile_if_enabled(
+                metadata_record, table["DatabaseName"], table["Name"]
             )
-
-
-
-
-
-        return (
-            (self.source_config.extract_delta_schema_from_parameters is True)
-            and (provider == "delta")
-            and (num_parts > 0)
-            and (columns is not None)
-            and (len(columns) == 1)
-            and (columns[0].get("Name", "") == "col")
-            and (columns[0].get("Type", "") == "array<string>")
+        except KeyError as e:
+            self.report.report_failure(
+                message="Failed to extract profile for table",
+                context=f"Table: {dataset_urn}",
+                exc=e,
             )
 
-
-
-
-
-
-
-
-
-
-
-
+    def _get_dataset_properties(self, table: Dict) -> DatasetPropertiesClass:
+        """Extract dataset properties from Glue table."""
+        storage_descriptor = table.get("StorageDescriptor", {})
+        custom_properties = {
+            **table.get("Parameters", {}),
+            **{
+                k: str(v)
+                for k, v in storage_descriptor.items()
+                if k not in ["Columns", "Parameters"]
+            },
+        }
 
-
-
+        return DatasetPropertiesClass(
+            description=table.get("Description"),
+            customProperties=custom_properties,
+            uri=table.get("Location"),
+            tags=[],
+            name=table["Name"],
+            qualifiedName=self.get_glue_arn(
+                account_id=table["CatalogId"],
+                database=table["DatabaseName"],
+                table=table["Name"],
+            ),
+        )
 
-
-
+    def _get_schema_metadata(
+        self, table: Dict, table_name: str, dataset_urn: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Glue table."""
+        if not table.get("StorageDescriptor"):
+            return None
 
-
-
+        # Check if this is a delta table with schema in parameters
+        if self._is_delta_schema(table):
+            return self._get_delta_schema_metadata(table, table_name, dataset_urn)
+        else:
+            return self._get_glue_schema_metadata(table, table_name)
 
-
-
-
-
-
-
-
-
-
-
-
+    def _is_delta_schema(self, table: Dict) -> bool:
+        """Check if table uses delta format with schema in parameters."""
+        if not self.source_config.extract_delta_schema_from_parameters:
+            return False
+
+        provider = table.get("Parameters", {}).get("spark.sql.sources.provider", "")
+        num_parts = int(
+            table.get("Parameters", {}).get("spark.sql.sources.schema.numParts", "0")
+        )
+        columns = table.get("StorageDescriptor", {}).get("Columns", [])
+
+        return (
+            provider == "delta"
+            and num_parts > 0
+            and columns
+            and len(columns) == 1
+            and columns[0].get("Name", "") == "col"
+            and columns[0].get("Type", "") == "array<string>"
+        )
+
+    def _get_glue_schema_metadata(
+        self, table: Dict, table_name: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Glue table columns."""
+        schema = table["StorageDescriptor"]["Columns"]
+        fields: List[SchemaField] = []
+
+        # Process regular columns
+        for field in schema:
+            schema_fields = get_schema_fields_for_hive_column(
+                hive_column_name=field["Name"],
+                hive_column_type=field["Type"],
+                description=field.get("Comment"),
+                default_nullable=True,
+            )
+            if schema_fields:
                 fields.extend(schema_fields)
 
-
-
+        # Process partition keys
+        partition_keys = table.get("PartitionKeys", [])
+        for partition_key in partition_keys:
+            schema_fields = get_schema_fields_for_hive_column(
+                hive_column_name=partition_key["Name"],
+                hive_column_type=partition_key.get("Type", "unknown"),
+                description=partition_key.get("Comment"),
+                default_nullable=False,
+            )
+            if schema_fields:
+                fields.extend(schema_fields)
+
+        return SchemaMetadata(
+            schemaName=table_name,
+            version=0,
+            fields=fields,
+            platform=f"urn:li:dataPlatform:{self.platform}",
+            hash="",
+            platformSchema=MySqlDDL(tableSchema=""),
+        )
+
+    def _get_delta_schema_metadata(
+        self, table: Dict, table_name: str, dataset_urn: str
+    ) -> Optional[SchemaMetadata]:
+        """Extract schema metadata from Delta table parameters."""
+        try:
+            # Reconstruct schema from parameters
+            num_parts = int(table["Parameters"]["spark.sql.sources.schema.numParts"])
+            schema_str = "".join(
+                table["Parameters"][f"spark.sql.sources.schema.part.{i}"]
+                for i in range(num_parts)
+            )
+            schema_json = json.loads(schema_str)
+
+            fields: List[SchemaField] = []
+            for field in schema_json["fields"]:
+                field_type = delta_type_to_hive_type(field.get("type", "unknown"))
                 schema_fields = get_schema_fields_for_hive_column(
-                hive_column_name=
-                hive_column_type=
-                description=
-                default_nullable=
+                    hive_column_name=field["name"],
+                    hive_column_type=field_type,
+                    description=field.get("description"),
+                    default_nullable=bool(field.get("nullable", True)),
                 )
-
-
+                if schema_fields:
+                    fields.extend(schema_fields)
 
+            self.report.num_dataset_valid_delta_schema += 1
             return SchemaMetadata(
                 schemaName=table_name,
                 version=0,
@@ -1353,108 +1578,128 @@ class GlueSource(StatefulIngestionSourceBase):
                 platformSchema=MySqlDDL(tableSchema=""),
             )
 
-
-
-
-
+        except Exception as e:
+            self.report_warning(
+                dataset_urn,
+                f"Could not parse schema for {table_name} because of {type(e).__name__}: {e}",
             )
+            self.report.num_dataset_invalid_delta_schema += 1
+            return None
 
-
-
-
-
-
-
-
+    def _get_data_platform_instance(self) -> DataPlatformInstanceClass:
+        """Get data platform instance aspect."""
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=(
+                make_dataplatform_instance_urn(
+                    self.platform, self.source_config.platform_instance
                 )
-
-
-
-
-            schema_fields = get_schema_fields_for_hive_column(
-                hive_column_name=field["name"],
-                hive_column_type=field_type,
-                description=field.get("description"),
-                default_nullable=bool(field.get("nullable", True)),
-            )
-            assert schema_fields
-            fields.extend(schema_fields)
+                if self.source_config.platform_instance
+                else None
+            ),
+        )
 
-
-
-
-
-
-
-            hash="",
-            platformSchema=MySqlDDL(tableSchema=""),
-        )
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def _get_ownership(owner: str) -> Optional[OwnershipClass]:
+        """Get ownership aspect for a given owner."""
+        if not owner:
+            return None
 
-
-
-
-
-            )
-            self.report.num_dataset_invalid_delta_schema += 1
-            return None
-
-        def get_data_platform_instance() -> DataPlatformInstanceClass:
-            return DataPlatformInstanceClass(
-                platform=make_data_platform_urn(self.platform),
-                instance=(
-                    make_dataplatform_instance_urn(
-                        self.platform, self.source_config.platform_instance
-                    )
-                    if self.source_config.platform_instance
-                    else None
-                ),
+        owners = [
+            OwnerClass(
+                owner=mce_builder.make_user_urn(owner),
+                type=OwnershipTypeClass.DATAOWNER,
             )
+        ]
+        return OwnershipClass(owners=owners)
 
-
-
-
-
-
-
-                type=OwnershipTypeClass.DATAOWNER,
-            )
-        ]
-        return OwnershipClass(
-            owners=owners,
-        )
+    def _get_s3_tags(self, table: Dict, dataset_urn: str) -> Optional[GlobalTagsClass]:
+        """Extract S3 tags if enabled."""
+        if not (
+            self.source_config.use_s3_bucket_tags
+            or self.source_config.use_s3_object_tags
+        ):
             return None
 
-
-
-
-
-                get_dataset_properties(),
-            ],
-        )
+        # Check if table has a location (VIRTUAL_VIEW tables may not)
+        location = table.get("StorageDescriptor", {}).get("Location")
+        if not location:
+            return None
 
-
-
-        dataset_snapshot.aspects.append(schema_metadata)
+        bucket_name = s3_util.get_bucket_name(location)
+        tags_to_add: List[str] = []
 
-
+        # Get bucket tags
+        if self.source_config.use_s3_bucket_tags:
+            try:
+                bucket_tags = self.s3_client.get_bucket_tagging(Bucket=bucket_name)
+                tags_to_add.extend(
+                    make_tag_urn(f"{tag['Key']}:{tag['Value']}")
+                    for tag in bucket_tags["TagSet"]
+                )
+            except self.s3_client.exceptions.ClientError:
+                logger.warning(f"No tags found for bucket={bucket_name}")
 
-        #
-        if self.
-
-
-
-
+        # Get object tags
+        if self.source_config.use_s3_object_tags:
+            key_prefix = s3_util.get_key_prefix(location)
+            try:
+                object_tagging = self.s3_client.get_object_tagging(
+                    Bucket=bucket_name, Key=key_prefix
+                )
+                if object_tagging["TagSet"]:
+                    tags_to_add.extend(
+                        make_tag_urn(f"{tag['Key']}:{tag['Value']}")
+                        for tag in object_tagging["TagSet"]
+                    )
+                else:
+                    logger.warning(
+                        f"No tags found for bucket={bucket_name} key={key_prefix}"
+                    )
+            except Exception as e:
+                logger.warning(f"Failed to get object tags: {e}")
 
-        if
-
-            or self.source_config.use_s3_object_tags
-        ):
-            s3_tags = get_s3_tags()
-            if s3_tags is not None:
-                dataset_snapshot.aspects.append(s3_tags)
+        if not tags_to_add:
+            return None
 
-
-
+        # Merge with existing tags if connected to DataHub API
+        if self.ctx.graph:
+            logger.debug("Connected to DatahubApi, grabbing current tags to maintain.")
+            current_tags: Optional[GlobalTagsClass] = self.ctx.graph.get_aspect(
+                entity_urn=dataset_urn, aspect_type=GlobalTagsClass
+            )
+            if current_tags:
+                tags_to_add.extend(current_tag.tag for current_tag in current_tags.tags)
+            else:
+                logger.warning(
+                    "Could not connect to DatahubApi. No current tags to maintain"
+                )
+
+        # Remove duplicates and create tags
+        unique_tags = sorted(set(tags_to_add))
+        return GlobalTagsClass(tags=[TagAssociationClass(tag) for tag in unique_tags])
+
+    def _get_lake_formation_tags(
+        self, tags: List[LakeFormationTag]
+    ) -> Optional[GlobalTagsClass]:
+        """Extract Lake Formation tags if enabled."""
+        tag_urns: List[str] = []
+        for tag in tags:
+            try:
+                tag_urns.append(tag.to_datahub_tag_urn().urn())
+            except InvalidUrnError as e:
+                logger.warning(
+                    f"Invalid Lake Formation tag URN for {tag}: {e}", exc_info=True
+                )
+                continue  # Skip invalid tags
+
+        tag_urns.sort()  # Sort to maintain consistent order
+        return (
+            GlobalTagsClass(tags=[TagAssociationClass(tag_urn) for tag_urn in tag_urns])
+            if tag_urns
+            else None
+        )
 
     def get_report(self):
         return self.report
|