acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff compares the contents of two package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between those versions as published.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2391 -2392
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +105 -88
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/iceberg/iceberg.py

@@ -38,6 +38,7 @@ from pyiceberg.types import (
 )
 
 from datahub.emitter.mce_builder import (
+    make_container_urn,
     make_data_platform_urn,
     make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
@@ -45,6 +46,7 @@ from datahub.emitter.mce_builder import (
     make_user_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import NamespaceKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +59,10 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    DatasetSubTypes,
+)
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergSourceConfig,
     IcebergSourceReport,
@@ -68,21 +74,22 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import Status
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     OtherSchema,
     SchemaField,
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
     TimeStampClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -121,9 +128,10 @@ class IcebergSource(StatefulIngestionSourceBase):
     [pyiceberg library](https://py.iceberg.apache.org/).
     """
 
+    platform: str = "iceberg"
+
     def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None:
         super().__init__(config, ctx)
-        self.platform: str = "iceberg"
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
 
@@ -140,13 +148,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]:
+    def _get_namespaces(self, catalog: Catalog) -> Iterable[Identifier]:
         namespaces = catalog.list_namespaces()
         LOGGER.debug(
             f"Retrieved {len(namespaces)} namespaces, first 10: {namespaces[:10]}"
         )
         self.report.report_no_listed_namespaces(len(namespaces))
-        tables_count = 0
         for namespace in namespaces:
             namespace_repr = ".".join(namespace)
             if not self.config.namespace_pattern.allowed(namespace_repr):
@@ -155,6 +162,14 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 self.report.report_dropped(f"{namespace_repr}.*")
                 continue
+            yield namespace
+
+    def _get_datasets(
+        self, catalog: Catalog, namespaces: Iterable[Tuple[Identifier, str]]
+    ) -> Iterable[Tuple[Identifier, str]]:
+        LOGGER.debug("Starting to retrieve tables")
+        tables_count = 0
+        for namespace, namespace_urn in namespaces:
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
@@ -164,29 +179,27 @@ class IcebergSource(StatefulIngestionSourceBase):
                 self.report.report_listed_tables_for_namespace(
                     ".".join(namespace), len(tables)
                 )
-                yield from tables
-            except NoSuchNamespaceError:
-                self.report.warning(
-                    f"NoSuchNamespaceError exception while trying to get list of tables from namespace {namespace}, skipping it",
+                yield from [(table, namespace_urn) for table in tables]
+            except NoSuchNamespaceError as e:
+                self.report.warning(
+                    title="No such namespace",
+                    message="Skipping the missing namespace.",
+                    context=str(namespace),
+                    exc=e,
                 )
             except Exception as e:
                 self.report.report_failure(
-                    f"Unexpected exception while trying to get list of tables for namespace {namespace}, skipping it"
+                    title="Error when processing a namespace",
+                    message="Skipping the namespace due to errors while processing it.",
+                    context=str(namespace),
+                    exc=e,
                 )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
         def _try_processing_dataset(
-            dataset_path: Tuple[str, ...], dataset_name: str
+            dataset_path: Tuple[str, ...], dataset_name: str, namespace_urn: str
         ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
@@ -202,56 +215,66 @@ class IcebergSource(StatefulIngestionSourceBase):
                         time_taken, dataset_name, table.metadata_location
                     )
                 LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
+                dataset_urn: str = make_dataset_urn_with_platform_instance(
+                    self.platform,
+                    dataset_name,
+                    self.config.platform_instance,
+                    self.config.env,
                 )
+                for aspect in self._create_iceberg_table_aspects(
+                    dataset_name, table, namespace_urn
+                ):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+            except NoSuchPropertyException as e:
+                self.report.warning(
+                    title="Unable to process table",
+                    message="Table was not processed due to expected property missing (table is probably not an iceberg table).",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchIcebergTableError as e:
-                self.report.warning(
-                    f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Skipped non-iceberg table",
+                    message="Table was recognized as non-iceberg and skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchTableError as e:
-                self.report.warning(
-                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Table not found",
+                    message="Table was returned by the catalog in the list of table but catalog can't find its details, table was skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except FileNotFoundError as e:
-                self.report.warning(
-                    f"FileNotFoundError while processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Manifest file not found",
+                    message="Couldn't find manifest file to read for the table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ServerError as e:
-                self.report.warning(
-                    f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg returned 500 HTTP status when trying to process a table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ValueError as e:
                 if "Could not initialize FileIO" not in str(e):
                     raise
                 self.report.warning(
-                    "Could not initialize FileIO",
+                    title="Could not initialize FileIO",
+                    message="Could not initialize FileIO for a table (are you using custom FileIO?). Skipping the table.",
+                    context=dataset_name,
+                    exc=e,
                 )
 
-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+        def _process_dataset(
+            dataset_path: Identifier, namespace_urn: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 LOGGER.debug(f"Processing dataset for path {dataset_path}")
                 dataset_name = ".".join(dataset_path)
@@ -263,106 +286,96 @@ class IcebergSource(StatefulIngestionSourceBase):
                     )
                     return
 
-                yield from _try_processing_dataset(dataset_path, dataset_name)
+                yield from _try_processing_dataset(
+                    dataset_path, dataset_name, namespace_urn
+                )
             except Exception as e:
                 self.report.report_failure(
-                    f"Exception while processing table {dataset_path}, skipping it.",
+                    title="Error when processing a table",
+                    message="Skipping the table due to errors when processing it.",
+                    context=str(dataset_path),
+                    exc=e,
                 )
 
         try:
             catalog = self.config.get_catalog()
         except Exception as e:
-            self.report.report_failure(
+            self.report.report_failure(
+                title="Failed to initialize catalog object",
+                message="Couldn't start the ingestion due to failure to initialize catalog object.",
+                exc=e,
+            )
+            return
+
+        try:
+            namespace_ids = self._get_namespaces(catalog)
+            namespaces: List[Tuple[Identifier, str]] = []
+            for namespace in namespace_ids:
+                namespace_repr = ".".join(namespace)
+                LOGGER.debug(f"Processing namespace {namespace_repr}")
+                namespace_urn = make_container_urn(
+                    NamespaceKey(
+                        namespace=namespace_repr,
+                        platform=self.platform,
+                        instance=self.config.platform_instance,
+                        env=self.config.env,
+                    )
+                )
+                namespaces.append((namespace, namespace_urn))
+                for aspect in self._create_iceberg_namespace_aspects(namespace):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=namespace_urn, aspect=aspect
+                    ).as_workunit()
+            LOGGER.debug("Namespaces ingestion completed")
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to list namespaces",
+                message="Couldn't start the ingestion due to a failure to process the list of the namespaces",
+                exc=e,
+            )
            return

        for wu in ThreadedIteratorExecutor.process(
            worker_func=_process_dataset,
-            args_list=[(dataset_path,) for dataset_path in self._get_datasets(catalog)],
+            args_list=[
+                (dataset_path, namespace_urn)
+                for dataset_path, namespace_urn in self._get_datasets(
+                    catalog, namespaces
+                )
+            ],
            max_workers=self.config.processing_threads,
        ):
            yield wu

-    def _create_iceberg_workunit(
-        self, dataset_name: str, table: Table
-    ) -> Iterable[MetadataWorkUnit]:
+    def _create_iceberg_table_aspects(
+        self, dataset_name: str, table: Table, namespace_urn: str
+    ) -> Iterable[_Aspect]:
        with PerfTimer() as timer:
            self.report.report_table_scanned(dataset_name)
            LOGGER.debug(f"Processing table {dataset_name}")
-            dataset_urn: str = make_dataset_urn_with_platform_instance(
-                self.platform,
-                dataset_name,
-                self.config.platform_instance,
-                self.config.env,
-            )
-            dataset_snapshot = DatasetSnapshot(
-                urn=dataset_urn,
-                aspects=[Status(removed=False)],
-            )
+            yield Status(removed=False)
+            yield SubTypes(typeNames=[DatasetSubTypes.TABLE])

-            additional_properties = {}
-            custom_properties = table.metadata.properties.copy()
-            custom_properties["location"] = table.metadata.location
-            custom_properties["format-version"] = str(table.metadata.format_version)
-            custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
-            if table.current_snapshot():
-                custom_properties["snapshot-id"] = str(
-                    table.current_snapshot().snapshot_id
-                )
-                custom_properties["manifest-list"] = (
-                    table.current_snapshot().manifest_list
-                )
-                additional_properties["lastModified"] = TimeStampClass(
-                    int(table.current_snapshot().timestamp_ms)
-                )
-            if "created-at" in custom_properties:
-                try:
-                    dt = dateutil_parser.isoparse(custom_properties["created-at"])
-                    additional_properties["created"] = TimeStampClass(
-                        int(dt.timestamp() * 1000)
-                    )
-                except Exception as ex:
-                    LOGGER.warning(
-                        f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
-                    )
+            yield self._get_dataset_properties_aspect(dataset_name, table)

-            dataset_properties = DatasetPropertiesClass(
-                name=table.name()[-1],
-                description=table.metadata.properties.get("comment", None),
-                customProperties=custom_properties,
-                lastModified=additional_properties.get("lastModified"),
-                created=additional_properties.get("created"),
-                qualifiedName=dataset_name,
-            )
-            dataset_snapshot.aspects.append(dataset_properties)
-            # Dataset ownership aspect.
            dataset_ownership = self._get_ownership_aspect(table)
            if dataset_ownership:
                LOGGER.debug(
                    f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
                )
-                dataset_snapshot.aspects.append(dataset_ownership)
+                yield dataset_ownership

-            schema_metadata = self._create_schema_metadata(dataset_name, table)
-            dataset_snapshot.aspects.append(schema_metadata)
+            yield self._create_schema_metadata(dataset_name, table)
+            yield self._get_dataplatform_instance_aspect()
+            yield ContainerClass(container=str(namespace_urn))

-            mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
            self.report.report_table_processing_time(
                timer.elapsed_seconds(), dataset_name, table.metadata_location
            )
-            yield MetadataWorkUnit(id=dataset_name, mce=mce)
-
-            dpi_aspect = self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
-            if dpi_aspect:
-                yield dpi_aspect

        if self.config.is_profiling_enabled():
            profiler = IcebergProfiler(self.report, self.config.profiling)
-            yield from profiler.profile_table(dataset_name, dataset_urn, table)
+            yield from profiler.profile_table(dataset_name, table)

    def _get_partition_aspect(self, table: Table) -> Optional[str]:
        """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.

@@ -401,12 +414,48 @@ class IcebergSource(StatefulIngestionSourceBase):
                ]
            )
        except Exception as e:
-            self.report.warning(
+            self.report.warning(
+                title="Failed to extract partition information",
+                message="Failed to extract partition information for a table. Table metadata will be ingested without it.",
+                context=str(table.name),
+                exc=e,
            )
            return None

+    def _get_dataset_properties_aspect(
+        self, dataset_name: str, table: Table
+    ) -> DatasetPropertiesClass:
+        additional_properties = {}
+        custom_properties = table.metadata.properties.copy()
+        custom_properties["location"] = table.metadata.location
+        custom_properties["format-version"] = str(table.metadata.format_version)
+        custom_properties["partition-spec"] = str(self._get_partition_aspect(table))
+        if table.current_snapshot():
+            custom_properties["snapshot-id"] = str(table.current_snapshot().snapshot_id)
+            custom_properties["manifest-list"] = table.current_snapshot().manifest_list
+            additional_properties["lastModified"] = TimeStampClass(
+                int(table.current_snapshot().timestamp_ms)
+            )
+        if "created-at" in custom_properties:
+            try:
+                dt = dateutil_parser.isoparse(custom_properties["created-at"])
+                additional_properties["created"] = TimeStampClass(
+                    int(dt.timestamp() * 1000)
+                )
+            except Exception as ex:
+                LOGGER.warning(
+                    f"Exception while trying to parse creation date {custom_properties['created-at']}, ignoring: {ex}"
+                )
+
+        return DatasetPropertiesClass(
+            name=table.name()[-1],
+            description=table.metadata.properties.get("comment", None),
+            customProperties=custom_properties,
+            lastModified=additional_properties.get("lastModified"),
+            created=additional_properties.get("created"),
+            qualifiedName=dataset_name,
+        )
+
    def _get_ownership_aspect(self, table: Table) -> Optional[OwnershipClass]:
        owners = []
        if self.config.user_ownership_property:
@@ -435,22 +484,15 @@ class IcebergSource(StatefulIngestionSourceBase):
            )
        return OwnershipClass(owners=owners) if owners else None

-    def _get_dataplatform_instance_aspect(
-        self, dataset_urn: str
-    ) -> Optional[MetadataWorkUnit]:
-        # If we are a platform instance based source, emit the instance aspect
-        if self.config.platform_instance:
-            return MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=DataPlatformInstanceClass(
-                    platform=make_data_platform_urn(self.platform),
-                    instance=make_dataplatform_instance_urn(
-                        self.platform, self.config.platform_instance
-                    ),
-                ),
-            ).as_workunit()
-
-        return None
+    def _get_dataplatform_instance_aspect(self) -> DataPlatformInstanceClass:
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=make_dataplatform_instance_urn(
+                self.platform, self.config.platform_instance
+            )
+            if self.config.platform_instance
+            else None,
+        )

    def _create_schema_metadata(
        self, dataset_name: str, table: Table
@@ -479,6 +521,17 @@ class IcebergSource(StatefulIngestionSourceBase):
    def get_report(self) -> SourceReport:
        return self.report

+    def _create_iceberg_namespace_aspects(
+        self, namespace: Identifier
+    ) -> Iterable[_Aspect]:
+        namespace_repr = ".".join(namespace)
+        yield Status(removed=False)
+        yield ContainerProperties(
+            name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+        )
+        yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
+        yield self._get_dataplatform_instance_aspect()
+

class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
    """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema."""
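The iceberg.py changes above replace the old MetadataChangeEvent/DatasetSnapshot emission with one MetadataChangeProposalWrapper per aspect, and group tables under namespace containers built from the new NamespaceKey. Below is a minimal sketch of that pattern, not code from the package; the namespace, table name, and URN are made-up examples, and it assumes the NamespaceKey fields shown in the diff (namespace, platform, instance, env).

```python
from datahub.emitter.mce_builder import make_container_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.mcp_builder import NamespaceKey
from datahub.metadata.schema_classes import ContainerClass, StatusClass

# Container URN for an Iceberg namespace, derived from a stable key
# (the same helpers the new source uses; the values are hypothetical).
namespace_urn = make_container_urn(
    NamespaceKey(
        namespace="db1.schema1",
        platform="iceberg",
        instance=None,  # the configured platform_instance, if any
        env="PROD",
    )
)

# A dataset URN for a table in that namespace (hypothetical).
table_urn = "urn:li:dataset:(urn:li:dataPlatform:iceberg,db1.schema1.table1,PROD)"

# One proposal per aspect, in place of a single MetadataChangeEvent snapshot:
mcps = [
    # mark the namespace container as present
    MetadataChangeProposalWrapper(
        entityUrn=namespace_urn, aspect=StatusClass(removed=False)
    ),
    # attach the table to its namespace container
    MetadataChangeProposalWrapper(
        entityUrn=table_urn, aspect=ContainerClass(container=namespace_urn)
    ),
]
for mcp in mcps:
    print(mcp.entityUrn, type(mcp.aspect).__name__)
```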
datahub/ingestion/source/iceberg/iceberg_profiler.py

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Callable, Dict, Iterable, Union, cast
+from typing import Any, Callable, Dict, Iterable, Optional, cast
 
 from pyiceberg.conversions import from_bytes
 from pyiceberg.schema import Schema
@@ -24,8 +24,6 @@ from pyiceberg.utils.datetime import (
 )
 
 from datahub.emitter.mce_builder import get_sys_time
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergProfilingConfig,
     IcebergSourceReport,
@@ -33,6 +31,7 @@ from datahub.ingestion.source.iceberg.iceberg_common import (
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -86,9 +85,8 @@ class IcebergProfiler:
     def profile_table(
         self,
         dataset_name: str,
-        dataset_urn: str,
         table: Table,
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[_Aspect]:
         """This method will profile the supplied Iceberg table by looking at the table's manifest.
 
         The overall profile of the table is aggregated from the individual manifest files.
@@ -167,11 +165,11 @@ class IcebergProfiler:
                 )
                 total_count += data_file.record_count
         except Exception as e:
+            self.report.warning(
+                title="Error when profiling a table",
+                message="Skipping profiling of the table due to errors",
+                context=dataset_name,
+                exc=e,
             )
         if row_count:
             # Iterating through fieldPaths introduces unwanted stats for list element fields...
@@ -211,14 +209,11 @@ class IcebergProfiler:
                 f"Finished profiling of dataset: {dataset_name} in {time_taken}"
             )
 
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=dataset_profile,
-            ).as_workunit()
+            yield dataset_profile
 
     def _render_value(
         self, dataset_name: str, value_type: IcebergType, value: Any
-    ) -> Union[str, None]:
+    ) -> Optional[str]:
         try:
             if isinstance(value_type, TimestampType):
                 return to_human_timestamp(value)
@@ -230,9 +225,17 @@ class IcebergProfiler:
                 return to_human_time(value)
             return str(value)
         except Exception as e:
-            self.report.warning(
-                "profiling",
+            self.report.warning(
+                title="Couldn't render value when profiling a table",
+                message="Encountered error, when trying to redner a value for table profile.",
+                context=str(
+                    {
+                        "value": value,
+                        "value_type": value_type,
+                        "dataset_name": dataset_name,
+                    }
+                ),
+                exc=e,
             )
             return None
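The profiler change mirrors the same refactor: profile_table no longer takes a dataset_urn or emits workunits itself; it yields bare aspects and the calling source wraps each one against the dataset URN. A minimal sketch of that caller-side contract follows; the wrap_aspects helper and the URN are illustrative, not code from the package.

```python
from typing import Iterable

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import DatasetProfileClass, _Aspect


def wrap_aspects(
    dataset_urn: str, aspects: Iterable[_Aspect]
) -> Iterable[MetadataWorkUnit]:
    # The caller now owns the entity URN; the profiler just yields aspects.
    for aspect in aspects:
        yield MetadataChangeProposalWrapper(
            entityUrn=dataset_urn, aspect=aspect
        ).as_workunit()


# e.g. the kind of aspect the new profile_table() yields:
profile = DatasetProfileClass(timestampMillis=0, rowCount=42, columnCount=3)
for wu in wrap_aspects(
    "urn:li:dataset:(urn:li:dataPlatform:iceberg,db1.schema1.table1,PROD)", [profile]
):
    print(wu.id)
```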