acryl-datahub 1.1.1rc3__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2559 -2532
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +226 -190
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +12 -16
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +71 -13
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +187 -35
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +43 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +33 -8
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1446 -559
- datahub/metadata/_urns/urn_defs.py +1721 -1553
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +18055 -17802
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/sdk/__init__.py +6 -0
- datahub/sdk/_all_entities.py +11 -0
- datahub/sdk/_shared.py +118 -1
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +309 -0
- datahub/sdk/datajob.py +367 -0
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +90 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc3.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/data_lake_common/object_store.py

@@ -1,3 +1,4 @@
+import re
 from abc import ABC, abstractmethod
 
 # Add imports for source customization

@@ -236,42 +237,76 @@ class ABSObjectStore(ObjectStoreInterface):
     """Implementation of ObjectStoreInterface for Azure Blob Storage."""
 
     PREFIX = "abfss://"
+    HTTPS_REGEX = re.compile(r"(https?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)")
 
     @classmethod
     def is_uri(cls, uri: str) -> bool:
-        return uri.startswith(cls.PREFIX)
+        return uri.startswith(cls.PREFIX) or bool(cls.HTTPS_REGEX.match(uri))
 
     @classmethod
     def get_prefix(cls, uri: str) -> Optional[str]:
         if uri.startswith(cls.PREFIX):
             return cls.PREFIX
+
+        # Check for HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return match.group(1)
+
         return None
 
     @classmethod
     def strip_prefix(cls, uri: str) -> str:
+        if uri.startswith(cls.PREFIX):
+            return uri[len(cls.PREFIX) :]
+
+        # Handle HTTPS format
+        match = cls.HTTPS_REGEX.match(uri)
+        if match:
+            return uri[len(match.group(1)) :]
+
+        raise ValueError(
+            f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+        )
 
     @classmethod
     def get_bucket_name(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            return cls.strip_prefix(uri).split("@")[0]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            return cls.strip_prefix(uri).split("/")[0]
 
     @classmethod
     def get_object_key(cls, uri: str) -> str:
         if not cls.is_uri(uri):
-            raise ValueError(
+            raise ValueError(
+                f"Not an ABS URI. Must start with prefix: {cls.PREFIX} or match Azure Blob Storage HTTPS pattern"
+            )
+
+        if uri.startswith(cls.PREFIX):
+            # abfss://container@account.dfs.core.windows.net/path
+            parts = cls.strip_prefix(uri).split("@", 1)
+            if len(parts) < 2:
+                return ""
+            account_path = parts[1]
+            path_parts = account_path.split("/", 1)
+            if len(path_parts) < 2:
+                return ""
+            return path_parts[1]
+        else:
+            # https://account.blob.core.windows.net/container/path
+            stripped = cls.strip_prefix(uri)
+            parts = stripped.split("/", 1)
+            if len(parts) < 2:
+                return ""
+            return parts[1]
 
 
 # Registry of all object store implementations
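
To illustrate the new dual URI support, here is a minimal sketch (not part of the diff) of what the updated helpers return for the two Azure Blob Storage URI styles; the module path is taken from the file list above and the expected values follow directly from the code in this hunk:

from datahub.ingestion.source.data_lake_common.object_store import ABSObjectStore

abfss_uri = "abfss://data@myacct.dfs.core.windows.net/raw/events.json"
https_uri = "https://myacct.blob.core.windows.net/data/raw/events.json"

# Both styles are now recognized as ABS URIs.
assert ABSObjectStore.is_uri(abfss_uri) and ABSObjectStore.is_uri(https_uri)

# Container ("bucket") and object key come out the same from either form.
assert ABSObjectStore.get_bucket_name(abfss_uri) == "data"
assert ABSObjectStore.get_bucket_name(https_uri) == "data"
assert ABSObjectStore.get_object_key(abfss_uri) == "raw/events.json"
assert ABSObjectStore.get_object_key(https_uri) == "raw/events.json"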

@@ -331,6 +366,12 @@ def get_object_store_bucket_name(uri: str) -> str:
         return uri[prefix_length:].split("/")[0]
     elif uri.startswith(ABSObjectStore.PREFIX):
         return uri[len(ABSObjectStore.PREFIX) :].split("@")[0]
+    elif ABSObjectStore.HTTPS_REGEX.match(uri):
+        # Handle HTTPS Azure Blob Storage URLs
+        match = ABSObjectStore.HTTPS_REGEX.match(uri)
+        if match:
+            stripped = uri[len(match.group(1)) :]
+            return stripped.split("/")[0]
 
     raise ValueError(f"Unsupported URI format: {uri}")
 

@@ -470,18 +511,25 @@ class ObjectStoreSourceAdapter:
         if not ABSObjectStore.is_uri(table_data.table_path):
             return None
 
-        # Parse the ABS URI
         try:
+            if table_data.table_path.startswith("abfss://"):
+                # URI format: abfss://container@account.dfs.core.windows.net/path
+                path_without_prefix = ABSObjectStore.strip_prefix(table_data.table_path)
+                parts = path_without_prefix.split("@", 1)
+                if len(parts) < 2:
+                    return None
+
+                container_name = parts[0]
+                account_parts = parts[1].split("/", 1)
+                account_domain = account_parts[0]
+                account_name = account_domain.split(".")[0]
+            else:
+                # Handle HTTPS format: https://account.blob.core.windows.net/container/path
+                container_name = ABSObjectStore.get_bucket_name(table_data.table_path)
+                if "blob.core.windows.net" in table_data.table_path:
+                    account_name = table_data.table_path.split("//")[1].split(".")[0]
+                else:
+                    return None
 
             # Construct Azure portal URL
             return f"https://portal.azure.com/#blade/Microsoft_Azure_Storage/ContainerMenuBlade/overview/storageAccountId/{account_name}/containerName/{container_name}"

@@ -519,6 +567,13 @@ class ObjectStoreSourceAdapter:
                 "get_external_url",
                 lambda table_data: self.get_gcs_external_url(table_data),
             )
+            # Fix URI mismatch issue in pattern matching
+            self.register_customization(
+                "_normalize_uri_for_pattern_matching",
+                self._normalize_gcs_uri_for_pattern_matching,
+            )
+            # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
+            self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
         elif platform == "s3":
             self.register_customization("is_s3_platform", lambda: True)
             self.register_customization("create_s3_path", self.create_s3_path)

@@ -612,6 +667,39 @@ class ObjectStoreSourceAdapter:
             return self.get_abs_external_url(table_data)
         return None
 
+    def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+        """
+        Normalize GCS URI for pattern matching.
+
+        This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+        fixing the URI mismatch issue in GCS ingestion.
+
+        Args:
+            uri: The URI to normalize
+
+        Returns:
+            The normalized URI for pattern matching
+        """
+        if uri.startswith("gs://"):
+            return uri.replace("gs://", "s3://", 1)
+        return uri
+
+    def _strip_gcs_prefix(self, uri: str) -> str:
+        """
+        Strip GCS prefix from URI.
+
+        This method removes the gs:// prefix from GCS URIs for path processing.
+
+        Args:
+            uri: The URI to strip the prefix from
+
+        Returns:
+            The URI without the gs:// prefix
+        """
+        if uri.startswith("gs://"):
+            return uri[5:]  # Remove "gs://" prefix
+        return uri
+
 
 # Factory function to create an adapter for a specific platform
 def create_object_store_adapter(
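
A quick sketch (not part of the diff) of why these two overrides matter: the S3-derived source matches paths against s3://-style path_specs, so gs:// URIs are rewritten only for pattern matching and prefix stripping while the real path keeps its scheme. Assume `adapter` is an ObjectStoreSourceAdapter configured for GCS (construction elided here):

uri = "gs://analytics-bucket/events/2024/01/data.parquet"

adapter._normalize_gcs_uri_for_pattern_matching(uri)
# -> "s3://analytics-bucket/events/2024/01/data.parquet"   (used only when matching path_spec patterns)

adapter._strip_gcs_prefix(uri)
# -> "analytics-bucket/events/2024/01/data.parquet"        (used for relative-path handling)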

datahub/ingestion/source/data_lake_common/path_spec.py

@@ -166,7 +166,6 @@ class PathSpec(ConfigModel):
         return False
 
     def allowed(self, path: str, ignore_ext: bool = False) -> bool:
-        logger.debug(f"Checking file to inclusion: {path}")
         if self.is_path_hidden(path) and not self.include_hidden_folders:
             return False
 
@@ -174,19 +173,17 @@ class PathSpec(ConfigModel):
             self.glob_include, flags=pathlib.GLOBSTAR
         ):
             return False
-
+
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path).globmatch(
                     exclude_path, flags=pathlib.GLOBSTAR
                 ):
                     return False
-        logger.debug(f"{path} is not excluded")
 
         table_name, _ = self.extract_table_name_and_path(path)
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
 
         ext = os.path.splitext(path)[1].strip(".")
 
@@ -196,8 +193,6 @@ class PathSpec(ConfigModel):
         ):
             return False
 
-        logger.debug(f"{path} had selected extension {ext}")
-        logger.debug(f"{path} allowed for dataset creation")
         return True
 
     def dir_allowed(self, path: str) -> bool:

@@ -219,10 +214,8 @@ class PathSpec(ConfigModel):
         for _ in range(slash_to_remove_from_glob):
             glob_include = glob_include.rsplit("/", 1)[0]
 
-        logger.debug(f"Checking dir to inclusion: {path}")
         if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
             return False
-        logger.debug(f"{path} matched include ")
         if self.exclude:
             for exclude_path in self.exclude:
                 if pathlib.PurePath(path.rstrip("/")).globmatch(

@@ -236,7 +229,7 @@ class PathSpec(ConfigModel):
         )
         if not self.tables_filter_pattern.allowed(table_name):
             return False
-        logger.debug(f"{path} is passed table name check")
+        # logger.debug(f"{path} is passed table name check")
 
         return True
 
@@ -246,10 +239,10 @@ class PathSpec(ConfigModel):
         if parsable_include.endswith("/{table}/**"):
             # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
             parsable_include = parsable_include[:-2]
+
+        # Replace all * with {folder[i]} to make it parsable
+        for i in range(parsable_include.count("*")):
+            parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
         return parsable_include
 
     def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:

@@ -330,8 +323,6 @@ class PathSpec(ConfigModel):
         if "{table}" in values["include"]:
             v = "{table}"
         else:
-            logger.debug(f"include fields: {compiled_include.named_fields}")
-            logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
             if not all(
                 x in compiled_include.named_fields
                 for x in parse.compile(v).named_fields

@@ -356,9 +347,7 @@ class PathSpec(ConfigModel):
     @cached_property
     def compiled_include(self):
         parsable_include = PathSpec.get_parsable_include(self.include)
-        logger.debug(f"parsable_include: {parsable_include}")
         compiled_include = parse.compile(parsable_include)
-        logger.debug(f"Setting compiled_include: {compiled_include}")
         return compiled_include
 
     @cached_property

@@ -366,9 +355,8 @@ class PathSpec(ConfigModel):
         parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
             "/", 1
         )[0]
-        logger.debug(f"parsable_folder_include: {parsable_folder_include}")
         compiled_folder_include = parse.compile(parsable_folder_include)
-
+
         return compiled_folder_include
 
     @cached_property

@@ -376,7 +364,8 @@ class PathSpec(ConfigModel):
         # Regular expression to find all substrings enclosed in {}
         pattern = r"\{(.*?)\}"
         # Find all matches
+        split_parts = self.include.split("{table}/")
+        matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
         return matches
 
     def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:

@@ -563,7 +552,7 @@ class PathSpec(ConfigModel):
                         f"{{{template_key}}}", var[key]
                     )
                 else:
-                    partition_format.replace(f"{{{var_key}}}", var)
+                    partition_format = partition_format.replace(f"{{{var_key}}}", var)
         return datetime.datetime.strptime(partition_format, datetime_format).replace(
             tzinfo=datetime.timezone.utc
         )
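
For context, a small sketch (not part of the diff) of what the rewritten get_parsable_include produces for an autodetect-partition include pattern; the result follows from the replacement loop above:

include = "s3://my-bucket/*/*/{table}/**"

# The trailing "**" is dropped because "/{table}/**" marks partition autodetection,
# then each remaining "*" becomes a named {folder[i]} field so the path can be
# parsed with the `parse` library.
parsable = include[:-2]
for i in range(parsable.count("*")):
    parsable = parsable.replace("*", f"{{folder[{i}]}}", 1)

assert parsable == "s3://my-bucket/{folder[0]}/{folder[1]}/{table}/"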

datahub/ingestion/source/datahub/config.py

@@ -118,6 +118,17 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )
 
+    structured_properties_template_cache_invalidation_interval: int = Field(
+        hidden_from_docs=True,
+        default=60,
+        description="Interval in seconds to invalidate the structured properties template cache.",
+    )
+
+    query_timeout: Optional[int] = Field(
+        default=None,
+        description="Timeout for each query in seconds. ",
+    )
+
     @root_validator(skip_on_failure=True)
     def check_ingesting_data(cls, values):
         if (
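
For orientation (not part of the diff), a rough sketch of the two new options on the `datahub` source config; the connection details are placeholders and only the two highlighted fields come from this release:

config_dict = {
    "database_connection": {"host_port": "localhost:3306"},  # placeholder connection details
    "query_timeout": 300,  # seconds per query; the default None applies no per-query timeout
    # structured_properties_template_cache_invalidation_interval defaults to 60 and is hidden from docs
}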

datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -1,10 +1,10 @@
-import contextlib
 import json
 import logging
+import time
 from datetime import datetime
 from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
 
-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text
 
 from datahub.emitter.aspect import ASPECT_MAP
 from datahub.emitter.mcp import MetadataChangeProposalWrapper

@@ -12,13 +12,14 @@ from datahub.emitter.serialization_helper import post_json_transform
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
 from datahub.ingestion.source.datahub.report import DataHubSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
-from datahub.metadata.schema_classes import
+from datahub.metadata.schema_classes import SystemMetadataClass
 from datahub.utilities.lossy_collections import LossyDict, LossyList
 
 logger = logging.getLogger(__name__)
 
 # Should work for at least mysql, mariadb, postgres
 DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+DATE_FORMAT = "%Y-%m-%d"
 
 ROW = TypeVar("ROW", bound=Dict[str, Any])
 

@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
             **connection_config.options,
         )
 
+        # Cache for available dates to avoid redundant queries
+        self.available_dates_cache: Optional[List[datetime]] = None
+
     @property
     def soft_deleted_urns_query(self) -> str:
         return f"""

@@ -100,14 +104,12 @@ class DataHubDatabaseReader:
             ORDER BY mav.urn
         """
 
+    def query(self, set_structured_properties_filter: bool) -> str:
+        """
+        Main query that gets data for specified date range with appropriate filters.
+        """
+        structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"
 
-        # Ensures stable order, chronological per (urn, aspect)
-        # Relies on createdon order to reflect version order
-        # Ordering of entries with the same createdon is handled by VersionOrderer
         return f"""
             SELECT *
             FROM (

@@ -132,6 +134,7 @@ class DataHubDatabaseReader:
                 {"" if self.config.include_all_versions else "AND mav.version = 0"}
                 {"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
                 AND mav.createdon >= %(since_createdon)s
+                AND mav.createdon < %(end_createdon)s
                 ORDER BY
                     createdon,
                     urn,
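
For clarity (not part of the diff), the new structured_prop_filter fragment expands to one of two SQL predicates depending on the flag, which is how the reader separates structured-property aspects from everything else:

def structured_prop_filter(set_structured_properties_filter: bool) -> str:
    # Mirrors the f-string in query(); %% is the escaped literal % for DB-API parameter binding.
    return f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"

structured_prop_filter(True)   # " AND urn  like 'urn:li:structuredProperty:%%'"
structured_prop_filter(False)  # " AND urn NOT like 'urn:li:structuredProperty:%%'"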

@@ -139,50 +142,194 @@ class DataHubDatabaseReader:
                     version
             ) as t
             WHERE 1=1
-        {"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
+        {"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
+        {structured_prop_filter}
             ORDER BY
                 createdon,
                 urn,
                 aspect,
                 version
+            LIMIT %(limit)s
+            OFFSET %(offset)s
         """
 
+    def execute_with_params(
+        self, query: str, params: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
+        """Execute query with proper parameter binding that works with your database"""
+        with self.engine.connect() as conn:
+            result = conn.execute(query, params or {})
+            return [dict(row) for row in result.fetchall()]
+
     def execute_server_cursor(
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
+        """Execute a query with server-side cursor"""
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with (
                     conn.begin()
                 ):  # Transaction required for PostgreSQL server-side cursor
+                    # Set query timeout at the connection level
+                    if self.config.query_timeout:
+                        if self.engine.dialect.name == "postgresql":
+                            conn.execute(
+                                text(
+                                    f"SET statement_timeout = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+                        elif self.engine.dialect.name in ["mysql", "mariadb"]:
+                            conn.execute(
+                                text(
+                                    f"SET max_execution_time = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+
+                    # Stream results with batch size
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
                     )
+
+                    # Execute query - using native parameterization without text()
+                    # to maintain compatibility with your original code
                    result = conn.execute(query, params)
                    for row in result:
                        yield dict(row)
+
+                    return  # Success, exit the retry loop
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
     def _get_rows(
-        self,
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        set_structured_properties_filter: bool,
+        limit: int,
     ) -> Iterable[Dict[str, Any]]:
-            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-        }
-        yield from self.execute_server_cursor(self.query, params)
+        """
+        Retrieves data rows within a specified date range using pagination.
 
+        Implements a hybrid pagination strategy that switches between time-based and
+        offset-based approaches depending on the returned data. Uses server-side
+        cursors for efficient memory usage.
+
+        Note: May return duplicate rows across batch boundaries when multiple rows
+        share the same 'createdon' timestamp. This is expected behavior when
+        transitioning between pagination methods.
+
+        Args:
+            start_date: Beginning of date range (inclusive)
+            end_date: End of date range (exclusive)
+            set_structured_properties_filter: Whether to apply structured filtering
+            limit: Maximum rows to fetch per query
+
+        Returns:
+            An iterable of database rows as dictionaries
+        """
+        offset = 0
+        last_createdon = None
+        first_iteration = True
+
+        while True:
+            try:
+                # Set up query and parameters - using named parameters
+                query = self.query(set_structured_properties_filter)
+                params: Dict[str, Any] = {
+                    "since_createdon": start_date.strftime(DATETIME_FORMAT),
+                    "end_createdon": end_date.strftime(DATETIME_FORMAT),
+                    "limit": limit,
+                    "offset": offset,
+                }
+
+                # Add exclude_aspects if needed
+                if (
+                    hasattr(self.config, "exclude_aspects")
+                    and self.config.exclude_aspects
+                ):
+                    params["exclude_aspects"] = tuple(self.config.exclude_aspects)
+
+                logger.info(
+                    f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
+                    f"with limit {limit} and offset {offset} (inclusive range)"
+                )
+
+                # Execute query with server-side cursor
+                rows = self.execute_server_cursor(query, params)
+                # Process and yield rows
+                rows_processed = 0
+                for row in rows:
+                    if first_iteration:
+                        start_date = row.get("createdon", start_date)
+                        first_iteration = False
+
+                    last_createdon = row.get("createdon")
+                    rows_processed += 1
+                    yield row
+
+                # If we processed fewer than the limit or no last_createdon, we're done
+                if rows_processed < limit or not last_createdon:
+                    break
+
+                # Update parameters for next iteration
+                if start_date != last_createdon:
+                    start_date = last_createdon
+                    offset = 0
+                else:
+                    offset += limit
+
+                logger.info(
+                    f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
+                )
+
+            except Exception as e:
+                logger.error(
+                    f"Error processing date range {start_date} to {end_date}: {str(e)}"
+                )
+                # Re-raise the exception after logging
+                raise
+
+    def get_all_aspects(
         self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
+        logger.info("Fetching Structured properties aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=True,
+        )
+
+        logger.info(
+            f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+        )
+
+        time.sleep(
+            self.config.structured_properties_template_cache_invalidation_interval
+        )
+
+        logger.info("Fetching aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=False,
+        )
+
+    def get_aspects(
+        self,
+        from_createdon: datetime,
+        stop_time: datetime,
+        set_structured_properties_filter: bool = False,
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
         orderer = VersionOrderer[Dict[str, Any]](
             enabled=self.config.include_all_versions
         )
-        rows = self._get_rows(
+        rows = self._get_rows(
+            start_date=from_createdon,
+            end_date=stop_time,
+            set_structured_properties_filter=set_structured_properties_filter,
+            limit=self.config.database_query_batch_size,
+        )
         for row in orderer(rows):
             mcp = self._parse_row(row)
             if mcp:
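
To make the hybrid pagination strategy above easier to follow, here is a stripped-down sketch (not part of the diff) of the same loop over an in-memory fetch function instead of a SQL cursor; the fetch_batch helper is hypothetical:

from typing import Any, Callable, Dict, Iterable, List

def paginate_hybrid(
    fetch_batch: Callable[[str, str, int, int], List[Dict[str, Any]]],  # hypothetical stand-in for the LIMIT/OFFSET query
    start: str,
    end: str,
    limit: int,
) -> Iterable[Dict[str, Any]]:
    # Yield rows in [start, end), advancing by createdon when it moves and
    # falling back to OFFSET paging while createdon stays the same.
    offset = 0
    while True:
        batch = fetch_batch(start, end, limit, offset)
        last_createdon = None
        for row in batch:
            last_createdon = row["createdon"]
            yield row
        if len(batch) < limit or last_createdon is None:
            break  # range exhausted
        if last_createdon != start:
            start, offset = last_createdon, 0  # time-based advance
        else:
            offset += limit  # same timestamp repeated: offset-based advance

As the new docstring notes, rows sharing one createdon can be re-emitted across batch boundaries; get_aspects still feeds everything through VersionOrderer to keep per-(urn, aspect) version ordering stable.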

@@ -190,23 +337,29 @@ class DataHubDatabaseReader:
 
     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
-        Fetches all soft-deleted entities from the database.
+        Fetches all soft-deleted entities from the database using pagination.
 
         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-        logger.debug("
+        try:
+            params: Dict = {}
+
+            logger.debug("Fetching soft-deleted URNs")
+
+            # Use server-side cursor implementation
+            rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
+            processed_rows = 0
+            # Process and yield rows
+            for row in rows:
+                processed_rows += 1
+                yield row
+
+            logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
+
+        except Exception:
+            logger.exception("Error fetching soft-deleted row", exc_info=True)
+            raise
 
     def _parse_row(
         self, row: Dict[str, Any]
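
As a side note (not part of the diff), the shared execute_server_cursor helper shown earlier turns the new query_timeout setting into a per-session statement before streaming results; for query_timeout=300 the emitted settings would be:

query_timeout = 300  # seconds, from the new DataHubSourceConfig.query_timeout option

# PostgreSQL session setting issued before the streaming query:
f"SET statement_timeout = {query_timeout * 1000}"   # -> "SET statement_timeout = 300000" (milliseconds)
# MySQL / MariaDB equivalent:
f"SET max_execution_time = {query_timeout * 1000}"  # -> "SET max_execution_time = 300000" (milliseconds)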

@@ -221,7 +374,6 @@ class DataHubDatabaseReader:
                 entityUrn=row["urn"],
                 aspect=ASPECT_MAP[row["aspect"]].from_obj(json_aspect),
                 systemMetadata=system_metadata,
-                changeType=ChangeTypeClass.UPSERT,
             )
         except Exception as e:
             logger.warning(

datahub/ingestion/source/datahub/datahub_source.py

@@ -117,7 +117,7 @@ class DataHubSource(StatefulIngestionSourceBase):
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
         progress = ProgressTimer(report_every=timedelta(seconds=60))
-        mcps = reader.
+        mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
             if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                 continue