acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dbt/dbt_common.py
CHANGED

@@ -357,6 +357,11 @@ class DBTCommonConfig(
         default=True,
         description="When enabled, includes the compiled code in the emitted metadata.",
     )
+    include_database_name: bool = Field(
+        default=True,
+        description="Whether to add database name to the table urn. "
+        "Set to False to skip it for engines like AWS Athena where it's not required.",
+    )

     @validator("target_platform")
     def validate_target_platform_value(cls, target_platform: str) -> str:
@@ -1028,7 +1033,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             cll_nodes.add(dbt_name)
             schema_nodes.add(dbt_name)

-        for dbt_name in all_nodes_map
+        for dbt_name in all_nodes_map:
             if self._is_allowed_node(dbt_name):
                 add_node_to_cll_list(dbt_name)

@@ -1769,10 +1774,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             logger.debug(
                 f"Owner after applying owner extraction pattern:'{self.config.owner_extraction_pattern}' is '{owner}'."
             )
-            if isinstance(owner, list)
-
-            else:
-                owners = [owner]
+            owners = owner if isinstance(owner, list) else [owner]
+
             for owner in owners:
                 if self.config.strip_user_ids_from_email:
                     owner = owner.split("@")[0]
datahub/ingestion/source/dbt/dbt_core.py
CHANGED

@@ -167,6 +167,7 @@ def extract_dbt_entities(
     use_identifiers: bool,
     tag_prefix: str,
     only_include_if_in_catalog: bool,
+    include_database_name: bool,
     report: DBTSourceReport,
 ) -> List[DBTNode]:
     sources_by_id = {x["unique_id"]: x for x in sources_results}
@@ -267,7 +268,7 @@
             dbt_name=key,
             dbt_adapter=manifest_adapter,
             dbt_package_name=manifest_node.get("package_name"),
-            database=manifest_node["database"],
+            database=manifest_node["database"] if include_database_name else None,
             schema=manifest_node["schema"],
             name=name,
             alias=manifest_node.get("alias"),
@@ -543,14 +544,15 @@ class DBTCoreSource(DBTSourceBase, TestableSource):
         all_catalog_entities = {**catalog_nodes, **catalog_sources}

         nodes = extract_dbt_entities(
-            all_manifest_entities,
-            all_catalog_entities,
-            sources_results,
-            manifest_adapter,
-            self.config.use_identifiers,
-            self.config.tag_prefix,
-            self.config.only_include_if_in_catalog,
-            self.
+            all_manifest_entities=all_manifest_entities,
+            all_catalog_entities=all_catalog_entities,
+            sources_results=sources_results,
+            manifest_adapter=manifest_adapter,
+            use_identifiers=self.config.use_identifiers,
+            tag_prefix=self.config.tag_prefix,
+            only_include_if_in_catalog=self.config.only_include_if_in_catalog,
+            include_database_name=self.config.include_database_name,
+            report=self.report,
         )

         return (
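The include_database_name flag added in the dbt hunks above is threaded from DBTCommonConfig into extract_dbt_entities. A minimal sketch of setting it from a recipe run through the Python API; the file paths and server URL are placeholders, and only include_database_name is the option introduced in this release.

from datahub.ingestion.run.pipeline import Pipeline

# Hypothetical recipe; paths and the DataHub server URL are placeholders.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./target/manifest.json",
                "catalog_path": "./target/catalog.json",
                "target_platform": "athena",
                # New flag: drop the database component from dataset urns,
                # e.g. for Athena where it is not required.
                "include_database_name": False,
            },
        },
        "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
    }
)
pipeline.run()
pipeline.raise_from_status()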
datahub/ingestion/source/dbt/dbt_tests.py
CHANGED

@@ -57,15 +57,11 @@ def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]:
         # base assertions are violated, bail early
         return None
     m = re.match(r"^ref\(\'(.*)\'\)$", destination_ref)
-    if m
-
-    else:
-        destination_table = destination_ref
+    destination_table = m.group(1) if m else destination_ref
+
     m = re.search(r"ref\(\'(.*)\'\)", source_ref)
-    if m
-
-    else:
-        source_table = source_ref
+    source_table = m.group(1) if m else source_ref
+
     return f"{source_table}.{column_name} referential integrity to {destination_table}.{dest_field_name}"


datahub/ingestion/source/delta_lake/config.py
CHANGED

@@ -13,6 +13,9 @@ from datahub.configuration.source_common import (
 )
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+)

 # hide annoying debug errors from py4j
 logging.getLogger("py4j").setLevel(logging.ERROR)
@@ -35,7 +38,11 @@ class S3(ConfigModel):
     )


-class DeltaLakeSourceConfig(
+class DeltaLakeSourceConfig(
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+    StatefulIngestionConfigBase,
+):
     base_path: str = Field(
         description="Path to table (s3 or local file system). If path is not a delta table path "
         "then all subfolders will be scanned to detect and ingest delta tables."
datahub/ingestion/source/delta_lake/report.py
CHANGED

@@ -1,12 +1,14 @@
 import dataclasses
 from dataclasses import field as dataclass_field

-from datahub.ingestion.
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
 from datahub.utilities.lossy_collections import LossyList


 @dataclasses.dataclass
-class DeltaLakeSourceReport(
+class DeltaLakeSourceReport(StaleEntityRemovalSourceReport):
     files_scanned = 0
     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)

datahub/ingestion/source/delta_lake/source.py
CHANGED

@@ -2,7 +2,7 @@ import json
 import logging
 import os
 import time
-from typing import Dict, Iterable, List
+from typing import Dict, Iterable, List, Optional
 from urllib.parse import urlparse

 from deltalake import DeltaTable
@@ -21,7 +21,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags
 from datahub.ingestion.source.aws.s3_util import (
@@ -36,6 +36,12 @@ from datahub.ingestion.source.delta_lake.delta_lake_utils import (
     read_delta_table,
 )
 from datahub.ingestion.source.delta_lake.report import DeltaLakeSourceReport
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
@@ -79,7 +85,7 @@ OPERATION_STATEMENT_TYPES = {
 @config_class(DeltaLakeSourceConfig)
 @support_status(SupportStatus.INCUBATING)
 @capability(SourceCapability.TAGS, "Can extract S3 object/bucket tags if enabled")
-class DeltaLakeSource(
+class DeltaLakeSource(StatefulIngestionSourceBase):
     """
     This plugin extracts:
     - Column types and schema associated with each delta table
@@ -100,9 +106,10 @@ class DeltaLakeSource(Source):
     storage_options: Dict[str, str]

     def __init__(self, config: DeltaLakeSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
+        self.ctx = ctx
         self.source_config = config
-        self.report = DeltaLakeSourceReport()
+        self.report: DeltaLakeSourceReport = DeltaLakeSourceReport()
         if self.source_config.is_s3:
             if (
                 self.source_config.s3 is None
@@ -331,6 +338,14 @@ class DeltaLakeSource(Source):
         for folder in os.listdir(path):
             yield os.path.join(path, folder)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         self.container_WU_creator = ContainerWUCreator(
             self.source_config.platform,
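With the changes above, the Delta Lake source (and, further down, the Elasticsearch and Feast sources) inherits from StatefulIngestionSourceBase and registers a StaleEntityRemovalHandler, so stale-entity removal can be switched on from the recipe. A sketch under the assumption that the standard stateful_ingestion fields apply; pipeline_name, base_path and the server URL are placeholders.

# Hypothetical recipe for the delta-lake source; values are placeholders.
recipe = {
    # A stable pipeline_name keys the checkpoint state between runs.
    "pipeline_name": "delta_lake_prod",
    "source": {
        "type": "delta-lake",
        "config": {
            "base_path": "s3://my-bucket/delta-tables/",
            "stateful_ingestion": {
                # Assumed StatefulStaleMetadataRemovalConfig fields.
                "enabled": True,
                "remove_stale_metadata": True,
            },
        },
    },
    "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
}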
datahub/ingestion/source/dremio/dremio_api.py
CHANGED

@@ -271,12 +271,12 @@ class DremioAPIOperations:
                     self.cancel_query(job_id)
                     raise DremioAPIException(
                         f"Query execution timed out after {timeout} seconds"
-                    )
+                    ) from None
         except RuntimeError as e:
-            raise DremioAPIException(
+            raise DremioAPIException() from e

         except requests.RequestException as e:
-            raise DremioAPIException(
+            raise DremioAPIException("Error executing query") from e

     def fetch_results(self, job_id: str) -> List[Dict]:
         """Fetch job results with status checking"""
@@ -683,11 +683,7 @@
         # Add end anchor for exact matching
         regex_pattern = regex_pattern + "$"

-        for path in paths
-            if re.match(regex_pattern, path, re.IGNORECASE):
-                return True
-
-        return False
+        return any(re.match(regex_pattern, path, re.IGNORECASE) for path in paths)

     def should_include_container(self, path: List[str], name: str) -> bool:
         """
datahub/ingestion/source/dremio/dremio_aspects.py
CHANGED

@@ -116,10 +116,7 @@ class SchemaFieldTypeMapper:
         data_type = data_type.lower()
         type_class = cls.FIELD_TYPE_MAPPING.get(data_type, NullTypeClass)

-        if data_size
-            native_data_type = f"{data_type}({data_size})"
-        else:
-            native_data_type = data_type
+        native_data_type = f"{data_type}({data_size})" if data_size else data_type

         try:
             schema_field_type = SchemaFieldDataTypeClass(type=type_class())
@@ -168,8 +165,9 @@ class DremioAspects:
         )

     def get_container_urn(
-        self, name: Optional[str] = None, path: Optional[List[str]] =
+        self, name: Optional[str] = None, path: Optional[List[str]] = None
     ) -> str:
+        path = path or []
         container_key = self.get_container_key(name, path)
         return container_key.as_urn()

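The get_container_urn change above follows the usual Python pattern for list-valued defaults: default to None and normalize inside the body, so one list object is never shared across calls. A standalone illustration (not DataHub code):

from typing import List, Optional

def build_path(name: str, path: Optional[List[str]] = None) -> List[str]:
    # Normalizing here avoids a mutable `path: List[str] = []` default,
    # which would be created once and shared by every call.
    path = path or []
    return [*path, name]

assert build_path("a") == ["a"]
assert build_path("b", ["x"]) == ["x", "b"]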
datahub/ingestion/source/dynamodb/dynamodb.py
CHANGED

@@ -165,6 +165,10 @@ _attribute_type_to_field_type_mapping: Dict[str, Type] = {
     SourceCapability.PLATFORM_INSTANCE,
     "By default, platform_instance will use the AWS account id",
 )
+@capability(
+    SourceCapability.CLASSIFICATION,
+    "Optionally enabled via `classification.enabled`",
+)
 class DynamoDBSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:
@@ -242,8 +246,10 @@ class DynamoDBSource(StatefulIngestionSourceBase):
             platform=self.platform,
             platform_instance=platform_instance,
             name=dataset_name,
+            env=self.config.env,
         )
         dataset_properties = DatasetPropertiesClass(
+            name=table_name,
             tags=[],
             customProperties={
                 "table.arn": table_info["TableArn"],
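The DynamoDB source now also advertises the CLASSIFICATION capability, gated on classification.enabled. A hedged recipe sketch; the credentials are placeholders and the classification block assumes the ClassificationConfig shape shared with other DataHub sources.

# Hypothetical source section for a dynamodb recipe; values are placeholders.
source_config = {
    "type": "dynamodb",
    "config": {
        "platform_instance": "123456789012",  # defaults to the AWS account id, per the capability note
        "aws_access_key_id": "<access-key-id>",
        "aws_secret_access_key": "<secret-access-key>",
        "classification": {
            # Assumed ClassificationConfig fields.
            "enabled": True,
            "classifiers": [{"type": "datahub"}],
        },
    },
}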
datahub/ingestion/source/elastic_search.py
CHANGED

@@ -32,9 +32,17 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.ingestion.source_config.operation_config import (
     OperationConfig,
     is_profiling_enabled,
@@ -188,7 +196,7 @@ class ElasticToSchemaFieldConverter:


 @dataclass
-class ElasticsearchSourceReport(
+class ElasticsearchSourceReport(StaleEntityRemovalSourceReport):
     index_scanned: int = 0
     filtered: LossyList[str] = field(default_factory=LossyList)

@@ -240,7 +248,11 @@ def collapse_urn(urn: str, collapse_urns: CollapseUrns) -> str:
     )


-class ElasticsearchSourceConfig(
+class ElasticsearchSourceConfig(
+    StatefulIngestionConfigBase,
+    PlatformInstanceConfigMixin,
+    EnvConfigMixin,
+):
     host: str = Field(
         default="localhost:9200", description="The elastic search host URI."
     )
@@ -337,7 +349,7 @@ class ElasticsearchSourceConfig(PlatformInstanceConfigMixin, EnvConfigMixin):
 @config_class(ElasticsearchSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
 @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
-class ElasticsearchSource(
+class ElasticsearchSource(StatefulIngestionSourceBase):
     """
     This plugin extracts the following:

@@ -346,7 +358,7 @@ class ElasticsearchSource(Source):
     """

     def __init__(self, config: ElasticsearchSourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
         self.client = Elasticsearch(
             self.source_config.host,
@@ -361,7 +373,7 @@ class ElasticsearchSource(Source):
             ssl_assert_fingerprint=self.source_config.ssl_assert_fingerprint,
             url_prefix=self.source_config.url_prefix,
         )
-        self.report = ElasticsearchSourceReport()
+        self.report: ElasticsearchSourceReport = ElasticsearchSourceReport()
         self.data_stream_partition_count: Dict[str, int] = defaultdict(int)
         self.platform: str = "elasticsearch"
         self.cat_response: Optional[List[Dict[str, Any]]] = None
@@ -373,6 +385,14 @@ class ElasticsearchSource(Source):
         config = ElasticsearchSourceConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         indices = self.client.indices.get_alias()
         for index in indices:
datahub/ingestion/source/feast.py
CHANGED

@@ -20,7 +20,6 @@ from feast.data_source import DataSource
 from pydantic import Field

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -31,8 +30,16 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalHandler,
+    StaleEntityRemovalSourceReport,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+    StatefulIngestionConfigBase,
+    StatefulIngestionSourceBase,
+)
 from datahub.metadata.com.linkedin.pegasus2avro.common import MLFeatureDataType
 from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import (
     MLFeatureSnapshot,
@@ -86,7 +93,9 @@ _field_type_mapping: Dict[Union[ValueType, feast.types.FeastType], str] = {
 }


-class FeastRepositorySourceConfig(
+class FeastRepositorySourceConfig(
+    StatefulIngestionConfigBase,
+):
     path: str = Field(description="Path to Feast repository")
     fs_yaml_file: Optional[str] = Field(
         default=None,
@@ -122,7 +131,7 @@ class FeastRepositorySourceConfig(ConfigModel):
 @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default")
 @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
 @dataclass
-class FeastRepositorySource(
+class FeastRepositorySource(StatefulIngestionSourceBase):
     """
     This plugin extracts:

@@ -135,13 +144,14 @@ class FeastRepositorySource(Source):

     platform = "feast"
     source_config: FeastRepositorySourceConfig
-    report:
+    report: StaleEntityRemovalSourceReport
     feature_store: FeatureStore

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
-        super().__init__(ctx)
+        super().__init__(config, ctx)
         self.source_config = config
-        self.
+        self.ctx = ctx
+        self.report = StaleEntityRemovalSourceReport()
         self.feature_store = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
@@ -158,7 +168,8 @@ class FeastRepositorySource(Source):

         if ml_feature_data_type is None:
             self.report.report_warning(
-
+                "unable to map type",
+                f"unable to map type {field_type} to metadata schema to parent: {parent_name}",
             )

             ml_feature_data_type = MLFeatureDataType.UNKNOWN
@@ -456,6 +467,14 @@ class FeastRepositorySource(Source):
         config = FeastRepositorySourceConfig.parse_obj(config_dict)
         return cls(config, ctx)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         for feature_view in self.feature_store.list_feature_views():
             for entity_name in feature_view.entities:
datahub/ingestion/source/file.py
CHANGED

@@ -351,7 +351,7 @@ class GenericFileSource(StatefulIngestionSourceBase, TestableSource):
                 self.report.add_deserialize_time(deserialize_duration)
                 yield i, item
             except Exception as e:
-                self.report.report_failure(f"path-{i}", str(e))
+                self.report.report_failure(f"{file_status.path}-{i}", str(e))

     @staticmethod
     def test_connection(config_dict: dict) -> TestConnectionReport:
@@ -410,10 +410,13 @@ def _from_obj_for_file(
         item = MetadataChangeEvent.from_obj(obj)
     elif "aspect" in obj:
         item = MetadataChangeProposalWrapper.from_obj(obj)
-
+    elif "bucket" in obj:
         item = UsageAggregationClass.from_obj(obj)
+    else:
+        raise ValueError(f"Unknown object type: {obj}")
+
     if not item.validate():
-        raise ValueError(f"
+        raise ValueError(f"Failed to parse: {obj}")

     if isinstance(item, UsageAggregationClass):
         logger.warning(f"Dropping deprecated UsageAggregationClass: {item}")
datahub/ingestion/source/gc/dataprocess_cleanup.py
CHANGED

@@ -498,7 +498,7 @@ class DataProcessCleanup:
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
-            for key in dataFlows
+            for key in dataFlows:
                 if not dataJobs.get(key) or len(dataJobs[key]) == 0:
                     logger.info(
                         f"Deleting dataflow {key} because there are not datajobs"
datahub/ingestion/source/gc/execution_request_cleanup.py
CHANGED

@@ -130,8 +130,9 @@ class DatahubExecutionRequestCleanup:
         )

     def _scroll_execution_requests(
-        self, overrides: Dict[str, Any] =
+        self, overrides: Optional[Dict[str, Any]] = None
     ) -> Iterator[CleanupRecord]:
+        overrides = overrides or {}
         headers: Dict[str, Any] = {
             "Accept": "application/json",
             "Content-Type": "application/json",
datahub/ingestion/source/ge_data_profiler.py
CHANGED

@@ -170,14 +170,10 @@ def get_column_unique_count_dh_patch(self: SqlAlchemyDataset, column: str) -> in
             ).select_from(self._table)
         )
         return convert_to_json_serializable(element_values.fetchone()[0])
-    elif
-
-
-
-        )
-        )
-        return convert_to_json_serializable(element_values.fetchone()[0])
-    elif self.engine.dialect.name.lower() == SNOWFLAKE:
+    elif (
+        self.engine.dialect.name.lower() == BIGQUERY
+        or self.engine.dialect.name.lower() == SNOWFLAKE
+    ):
         element_values = self.engine.execute(
             sa.select(sa.func.APPROX_COUNT_DISTINCT(sa.column(column))).select_from(
                 self._table
@@ -381,13 +377,14 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
             col = col_dict["name"]
             self.column_types[col] = str(col_dict["type"])
             # We expect the allow/deny patterns to specify '<table_pattern>.<column_pattern>'
-            if
-
+            if (
+                not self.config._allow_deny_patterns.allowed(
+                    f"{self.dataset_name}.{col}"
+                )
+                or not self.config.profile_nested_fields
+                and "." in col
             ):
                 ignored_columns_by_pattern.append(col)
-            # We try to ignore nested columns as well
-            elif not self.config.profile_nested_fields and "." in col:
-                ignored_columns_by_pattern.append(col)
             elif col_dict.get("type") and self._should_ignore_column(col_dict["type"]):
                 ignored_columns_by_type.append(col)
             else:
@@ -605,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower()
+            if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -1408,7 +1405,7 @@ class DatahubGEProfiler:
             },
         )

-        if platform
+        if platform in (BIGQUERY, DATABRICKS):
             # This is done as GE makes the name as DATASET.TABLE
             # but we want it to be PROJECT.DATASET.TABLE instead for multi-project setups
             name_parts = pretty_name.split(".")
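The profiler hunks above route BigQuery and Snowflake through APPROX_COUNT_DISTINCT and compute medians for Snowflake and Databricks. A standalone sketch of the SQLAlchemy expressions involved; sa.func builds the function call purely by name, so no dialect-specific import is needed (SQLAlchemy 1.4+ style):

import sqlalchemy as sa

col = sa.column("user_id")
table = sa.table("events")

# Approximate distinct count, as issued for BigQuery and Snowflake.
approx_distinct = sa.select(sa.func.APPROX_COUNT_DISTINCT(col)).select_from(table)

# Median, as issued for Snowflake and Databricks.
median = sa.select(sa.func.median(col)).select_from(table)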
datahub/ingestion/source/iceberg/iceberg.py
CHANGED

@@ -2,8 +2,9 @@ import json
 import logging
 import threading
 import uuid
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple

+from dateutil import parser as dateutil_parser
 from pyiceberg.catalog import Catalog
 from pyiceberg.exceptions import (
     NoSuchIcebergTableError,
@@ -81,6 +82,7 @@ from datahub.metadata.schema_classes import (
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
+    TimeStampClass,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -183,16 +185,9 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()

-        def
-
-
-            if not self.config.table_pattern.allowed(dataset_name):
-                # Dataset name is rejected by pattern, report as dropped.
-                self.report.report_dropped(dataset_name)
-                LOGGER.debug(
-                    f"Skipping table {dataset_name} due to not being allowed by the config pattern"
-                )
-                return
+        def _try_processing_dataset(
+            dataset_path: Tuple[str, ...], dataset_name: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
                     LOGGER.debug(
@@ -248,10 +243,31 @@ class IcebergSource(StatefulIngestionSourceBase):
                 LOGGER.warning(
                     f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
                 )
+            except ValueError as e:
+                if "Could not initialize FileIO" not in str(e):
+                    raise
+                self.report.warning(
+                    "Could not initialize FileIO",
+                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                )
+
+        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+            try:
+                LOGGER.debug(f"Processing dataset for path {dataset_path}")
+                dataset_name = ".".join(dataset_path)
+                if not self.config.table_pattern.allowed(dataset_name):
+                    # Dataset name is rejected by pattern, report as dropped.
+                    self.report.report_dropped(dataset_name)
+                    LOGGER.debug(
+                        f"Skipping table {dataset_name} due to not being allowed by the config pattern"
+                    )
+                    return
+
+                yield from _try_processing_dataset(dataset_path, dataset_name)
             except Exception as e:
                 self.report.report_failure(
                     "general",
-                    f"Failed to create workunit for dataset {
+                    f"Failed to create workunit for dataset {dataset_path}: {e}",
                 )
                 LOGGER.exception(
                     f"Exception while processing table {dataset_path}, skipping it.",
@@ -288,6 +304,7 @@ class IcebergSource(StatefulIngestionSourceBase):
         )

         # Dataset properties aspect.
+        additional_properties = {}
         custom_properties = table.metadata.properties.copy()
         custom_properties["location"] = table.metadata.location
         custom_properties["format-version"] = str(table.metadata.format_version)
@@ -299,10 +316,27 @@ class IcebergSource(StatefulIngestionSourceBase):
             custom_properties["manifest-list"] = (
                 table.current_snapshot().manifest_list
             )
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
         dataset_properties = DatasetPropertiesClass(
             name=table.name()[-1],
             description=table.metadata.properties.get("comment", None),
             customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
         )
         dataset_snapshot.aspects.append(dataset_properties)
         # Dataset ownership aspect.
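The new Iceberg aspect code converts the table's created-at property and the current snapshot timestamp into TimeStampClass values (milliseconds since epoch). A standalone sketch of that conversion, assuming only dateutil and the generated TimeStampClass; the timestamp values are made up.

from dateutil import parser as dateutil_parser

from datahub.metadata.schema_classes import TimeStampClass

# An ISO-8601 string such as Iceberg stores in the "created-at" table property.
created_at = "2024-11-05T12:30:45.123456+00:00"

dt = dateutil_parser.isoparse(created_at)
created = TimeStampClass(int(dt.timestamp() * 1000))  # milliseconds since epoch

# Snapshot timestamps are already in milliseconds.
last_modified = TimeStampClass(1730809845123)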