acryl-datahub 1.0.0.1rc5__py3-none-any.whl → 1.0.0.1rc7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc7.dist-info}/METADATA +2451 -2451
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc7.dist-info}/RECORD +32 -32
- datahub/_version.py +1 -1
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/rest_emitter.py +13 -5
- datahub/errors.py +4 -0
- datahub/ingestion/api/source.py +2 -1
- datahub/ingestion/api/source_helpers.py +9 -1
- datahub/ingestion/graph/client.py +20 -9
- datahub/ingestion/graph/filters.py +41 -16
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/subtypes.py +1 -0
- datahub/ingestion/source/iceberg/iceberg.py +159 -102
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +12 -1
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +73 -11
- datahub/utilities/threaded_iterator_executor.py +16 -3
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc7.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc5.dist-info → acryl_datahub-1.0.0.1rc7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/cassandra/cassandra.py (+1 -10)

@@ -123,16 +123,7 @@ class CassandraSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def get_workunits_internal(
-        self,
-    ) -> Iterable[MetadataWorkUnit]:
-        for metadata in self._get_metadata():
-            if isinstance(metadata, MetadataWorkUnit):
-                yield metadata
-            else:
-                yield from metadata.as_workunits()
-
-    def _get_metadata(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         if not self.cassandra_api.authenticate():
             return
         keyspaces: List[CassandraKeyspace] = self.cassandra_api.get_keyspaces()
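The Cassandra change collapses the old two-step pipeline into a single generator that may yield either `MetadataWorkUnit` objects or SDK `Entity` objects; based on the `source.py` and `source_helpers.py` changes listed in this diff, the conversion to workunits now appears to happen in the framework rather than in each source. A minimal sketch of that conversion, assuming the SDK `Entity` base class lives at `datahub.sdk.entity` and exposes the `as_workunits()` method seen in the removed code:

```python
from typing import Iterable, Union

from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.sdk.entity import Entity  # assumed import path for the SDK Entity base class


def to_workunits(
    stream: Iterable[Union[MetadataWorkUnit, Entity]],
) -> Iterable[MetadataWorkUnit]:
    # Mirrors what the removed wrapper did explicitly: pass workunits through
    # unchanged and expand SDK entities into one workunit per aspect.
    for item in stream:
        if isinstance(item, MetadataWorkUnit):
            yield item
        else:
            yield from item.as_workunits()
```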
datahub/ingestion/source/iceberg/iceberg.py (+159 -102)

@@ -38,6 +38,7 @@ from pyiceberg.types import (
 )
 
 from datahub.emitter.mce_builder import (
+    make_container_urn,
     make_data_platform_urn,
     make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
@@ -45,6 +46,7 @@ from datahub.emitter.mce_builder import (
     make_user_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import NamespaceKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +59,10 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    DatasetSubTypes,
+)
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergSourceConfig,
     IcebergSourceReport,
@@ -68,19 +74,22 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import Status
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     OtherSchema,
     SchemaField,
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
     TimeStampClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -119,9 +128,10 @@ class IcebergSource(StatefulIngestionSourceBase):
     [pyiceberg library](https://py.iceberg.apache.org/).
     """
 
+    platform: str = "iceberg"
+
     def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None:
         super().__init__(config, ctx)
-        self.platform: str = "iceberg"
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
 
@@ -138,13 +148,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def …
+    def _get_namespaces(self, catalog: Catalog) -> Iterable[Identifier]:
         namespaces = catalog.list_namespaces()
         LOGGER.debug(
             f"Retrieved {len(namespaces)} namespaces, first 10: {namespaces[:10]}"
         )
         self.report.report_no_listed_namespaces(len(namespaces))
-        tables_count = 0
         for namespace in namespaces:
             namespace_repr = ".".join(namespace)
             if not self.config.namespace_pattern.allowed(namespace_repr):
@@ -153,6 +162,14 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 self.report.report_dropped(f"{namespace_repr}.*")
                 continue
+            yield namespace
+
+    def _get_datasets(
+        self, catalog: Catalog, namespaces: Iterable[Tuple[Identifier, str]]
+    ) -> Iterable[Tuple[Identifier, str]]:
+        LOGGER.debug("Starting to retrieve tables")
+        tables_count = 0
+        for namespace, namespace_urn in namespaces:
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
@@ -162,29 +179,27 @@ class IcebergSource(StatefulIngestionSourceBase):
                 self.report.report_listed_tables_for_namespace(
                     ".".join(namespace), len(tables)
                 )
-                yield from tables
-            except NoSuchNamespaceError:
-                self.report.…
-                    "…",
-                    f"NoSuchNamespaceError exception while trying to get list of tables from namespace {namespace}, skipping it",
+                yield from [(table, namespace_urn) for table in tables]
+            except NoSuchNamespaceError as e:
+                self.report.warning(
+                    title="No such namespace",
+                    message="Skipping the missing namespace.",
+                    context=str(namespace),
+                    exc=e,
                 )
             except Exception as e:
                 self.report.report_failure(
-                    "…",
-                    f"Unexpected exception while trying to get list of tables for namespace {namespace}, skipping it"
+                    title="Error when processing a namespace",
+                    message="Skipping the namespace due to errors while processing it.",
+                    context=str(namespace),
+                    exc=e,
                 )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
         def _try_processing_dataset(
-            dataset_path: Tuple[str, ...], dataset_name: str
+            dataset_path: Tuple[str, ...], dataset_name: str, namespace_urn: str
         ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
@@ -200,56 +215,66 @@ class IcebergSource(StatefulIngestionSourceBase):
                     time_taken, dataset_name, table.metadata_location
                 )
                 LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
-                …
+                dataset_urn: str = make_dataset_urn_with_platform_instance(
+                    self.platform,
+                    dataset_name,
+                    self.config.platform_instance,
+                    self.config.env,
                 )
-                …
+                for aspect in self._create_iceberg_table_aspects(
+                    dataset_name, table, namespace_urn
+                ):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+            except NoSuchPropertyException as e:
+                self.report.warning(
+                    title="Unable to process table",
+                    message="Table was not processed due to expected property missing (table is probably not an iceberg table).",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchIcebergTableError as e:
-                self.report.…
-                    "…",
-                    f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Skipped non-iceberg table",
+                    message="Table was recognized as non-iceberg and skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchTableError as e:
-                self.report.…
-                    "…",
-                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Table not found",
+                    message="Table was returned by the catalog in the list of table but catalog can't find its details, table was skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except FileNotFoundError as e:
-                self.report.…
-                    "file…",
-                    f"FileNotFoundError while processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Manifest file not found",
+                    message="Couldn't find manifest file to read for the table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ServerError as e:
-                self.report.…
-                    "…",
-                    f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg returned 500 HTTP status when trying to process a table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ValueError as e:
                 if "Could not initialize FileIO" not in str(e):
                     raise
                 self.report.warning(
-                    "Could not initialize FileIO",
-                    …
+                    title="Could not initialize FileIO",
+                    message="Could not initialize FileIO for a table (are you using custom FileIO?). Skipping the table.",
+                    context=dataset_name,
+                    exc=e,
                 )
 
-        def _process_dataset(…
+        def _process_dataset(
+            dataset_path: Identifier, namespace_urn: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 LOGGER.debug(f"Processing dataset for path {dataset_path}")
                 dataset_name = ".".join(dataset_path)
@@ -261,66 +286,88 @@ class IcebergSource(StatefulIngestionSourceBase):
                     )
                     return
 
-                yield from _try_processing_dataset(dataset_path, dataset_name)
+                yield from _try_processing_dataset(
+                    dataset_path, dataset_name, namespace_urn
+                )
             except Exception as e:
                 self.report.report_failure(
-                    "…",
-                    f"Exception while processing table {dataset_path}, skipping it.",
+                    title="Error when processing a table",
+                    message="Skipping the table due to errors when processing it.",
+                    context=str(dataset_path),
+                    exc=e,
                 )
 
         try:
             catalog = self.config.get_catalog()
         except Exception as e:
-            self.report.report_failure(…
+            self.report.report_failure(
+                title="Failed to initialize catalog object",
+                message="Couldn't start the ingestion due to failure to initialize catalog object.",
+                exc=e,
+            )
+            return
+
+        try:
+            namespace_ids = self._get_namespaces(catalog)
+            namespaces: List[Tuple[Identifier, str]] = []
+            for namespace in namespace_ids:
+                namespace_repr = ".".join(namespace)
+                LOGGER.debug(f"Processing namespace {namespace_repr}")
+                namespace_urn = make_container_urn(
+                    NamespaceKey(
+                        namespace=namespace_repr,
+                        platform=self.platform,
+                        instance=self.config.platform_instance,
+                        env=self.config.env,
+                    )
+                )
+                namespaces.append((namespace, namespace_urn))
+                for aspect in self._create_iceberg_namespace_aspects(namespace):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=namespace_urn, aspect=aspect
+                    ).as_workunit()
+            LOGGER.debug("Namespaces ingestion completed")
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to list namespaces",
+                message="Couldn't start the ingestion due to a failure to process the list of the namespaces",
+                exc=e,
+            )
             return
 
         for wu in ThreadedIteratorExecutor.process(
             worker_func=_process_dataset,
-            args_list=[…],
+            args_list=[
+                (dataset_path, namespace_urn)
+                for dataset_path, namespace_urn in self._get_datasets(
+                    catalog, namespaces
+                )
+            ],
             max_workers=self.config.processing_threads,
         ):
            yield wu
 
-    def …(
-        self, dataset_name: str, table: Table
-    ) -> Iterable[…]:
+    def _create_iceberg_table_aspects(
+        self, dataset_name: str, table: Table, namespace_urn: str
+    ) -> Iterable[_Aspect]:
         with PerfTimer() as timer:
             self.report.report_table_scanned(dataset_name)
             LOGGER.debug(f"Processing table {dataset_name}")
-            dataset_urn: str = make_dataset_urn_with_platform_instance(
-                self.platform,
-                dataset_name,
-                self.config.platform_instance,
-                self.config.env,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn, aspect=Status(removed=False)
-            ).as_workunit()
+            yield Status(removed=False)
+            yield SubTypes(typeNames=[DatasetSubTypes.TABLE])
 
-            dataset_properties = self._get_dataset_properties_aspect(
-                dataset_name, table
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn, aspect=dataset_properties
-            ).as_workunit()
+            yield self._get_dataset_properties_aspect(dataset_name, table)
 
             dataset_ownership = self._get_ownership_aspect(table)
             if dataset_ownership:
                 LOGGER.debug(
                     f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
                 )
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn, aspect=dataset_ownership
-                ).as_workunit()
+                yield dataset_ownership
 
-            …
-            yield MetadataChangeProposalWrapper(
-                …
-            ).as_workunit()
-            yield self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
+            yield self._create_schema_metadata(dataset_name, table)
+            yield self._get_dataplatform_instance_aspect()
+            yield ContainerClass(container=str(namespace_urn))
 
             self.report.report_table_processing_time(
                 timer.elapsed_seconds(), dataset_name, table.metadata_location
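Each Iceberg namespace is now modelled as a DataHub container whose URN is derived from the new `NamespaceKey` (the +8 lines in `datahub/emitter/mcp_builder.py` in this diff). A rough sketch of the URN construction used above, assuming `NamespaceKey` behaves like the existing ContainerKey-style keys in `mcp_builder` (its fields are hashed into a deterministic container guid):

```python
from datahub.emitter.mce_builder import make_container_urn
from datahub.emitter.mcp_builder import NamespaceKey  # added in this release

# Field names are taken from the call site above; the guid-hashing behaviour is assumed.
key = NamespaceKey(
    namespace="db1.schema1",
    platform="iceberg",
    instance=None,  # set to config.platform_instance when one is configured
    env="PROD",
)
namespace_urn = make_container_urn(key)  # e.g. "urn:li:container:<guid>"
print(namespace_urn)
```

Datasets in the namespace are then attached to this container through the `ContainerClass(container=namespace_urn)` aspect emitted by `_create_iceberg_table_aspects` above.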
@@ -328,7 +375,7 @@ class IcebergSource(StatefulIngestionSourceBase):
 
         if self.config.is_profiling_enabled():
             profiler = IcebergProfiler(self.report, self.config.profiling)
-            yield from profiler.profile_table(dataset_name, dataset_urn, table)
+            yield from profiler.profile_table(dataset_name, table)
 
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
@@ -367,9 +414,11 @@ class IcebergSource(StatefulIngestionSourceBase):
                 ]
             )
         except Exception as e:
-            self.report.…
-                "extract…",
+            self.report.warning(
+                title="Failed to extract partition information",
+                message="Failed to extract partition information for a table. Table metadata will be ingested without it.",
+                context=str(table.name),
+                exc=e,
             )
             return None
 
@@ -435,18 +484,15 @@ class IcebergSource(StatefulIngestionSourceBase):
         )
         return OwnershipClass(owners=owners) if owners else None
 
-    def _get_dataplatform_instance_aspect(self…
-        return …
-            …
-                else None,
-            ),
-        ).as_workunit()
+    def _get_dataplatform_instance_aspect(self) -> DataPlatformInstanceClass:
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=make_dataplatform_instance_urn(
+                self.platform, self.config.platform_instance
+            )
+            if self.config.platform_instance
+            else None,
+        )
 
     def _create_schema_metadata(
         self, dataset_name: str, table: Table
@@ -475,6 +521,17 @@ class IcebergSource(StatefulIngestionSourceBase):
     def get_report(self) -> SourceReport:
         return self.report
 
+    def _create_iceberg_namespace_aspects(
+        self, namespace: Identifier
+    ) -> Iterable[_Aspect]:
+        namespace_repr = ".".join(namespace)
+        yield Status(removed=False)
+        yield ContainerProperties(
+            name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+        )
+        yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
+        yield self._get_dataplatform_instance_aspect()
+
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
     """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema."""
datahub/ingestion/source/iceberg/iceberg_profiler.py (+21 -18)

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Callable, Dict, Iterable, …
+from typing import Any, Callable, Dict, Iterable, Optional, cast
 
 from pyiceberg.conversions import from_bytes
 from pyiceberg.schema import Schema
@@ -24,8 +24,6 @@ from pyiceberg.utils.datetime import (
 )
 
 from datahub.emitter.mce_builder import get_sys_time
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergProfilingConfig,
     IcebergSourceReport,
@@ -33,6 +31,7 @@ from datahub.ingestion.source.iceberg.iceberg_common import (
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -86,9 +85,8 @@ class IcebergProfiler:
     def profile_table(
         self,
         dataset_name: str,
-        dataset_urn: str,
         table: Table,
-    ) -> Iterable[…]:
+    ) -> Iterable[_Aspect]:
         """This method will profile the supplied Iceberg table by looking at the table's manifest.
 
         The overall profile of the table is aggregated from the individual manifest files.
@@ -167,11 +165,11 @@ class IcebergProfiler:
                         )
                         total_count += data_file.record_count
             except Exception as e:
-                …
+                self.report.warning(
+                    title="Error when profiling a table",
+                    message="Skipping profiling of the table due to errors",
+                    context=dataset_name,
+                    exc=e,
                 )
             if row_count:
                 # Iterating through fieldPaths introduces unwanted stats for list element fields...
@@ -211,14 +209,11 @@ class IcebergProfiler:
                 f"Finished profiling of dataset: {dataset_name} in {time_taken}"
             )
 
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=dataset_profile,
-            ).as_workunit()
+            yield dataset_profile
 
     def _render_value(
         self, dataset_name: str, value_type: IcebergType, value: Any
-    ) -> …:
+    ) -> Optional[str]:
         try:
             if isinstance(value_type, TimestampType):
                 return to_human_timestamp(value)
@@ -230,9 +225,17 @@ class IcebergProfiler:
                 return to_human_time(value)
             return str(value)
         except Exception as e:
-            self.report.…
-                "profiling",
-                …
+            self.report.warning(
+                title="Couldn't render value when profiling a table",
+                message="Encountered error, when trying to redner a value for table profile.",
+                context=str(
+                    {
+                        "value": value,
+                        "value_type": value_type,
+                        "dataset_name": dataset_name,
+                    }
+                ),
+                exc=e,
             )
             return None
 
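With these profiler changes, `profile_table()` no longer emits workunits itself: it yields bare aspects (e.g. `DatasetProfileClass`) and the source decides which URN they attach to, which is why the `dataset_urn` parameter and the `MetadataChangeProposalWrapper` import could be dropped. A short sketch of the resulting call pattern, matching the wrapping already shown in `_try_processing_dataset` above:

```python
from typing import Iterable

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit


def emit_profile_workunits(
    profiler, dataset_name: str, dataset_urn: str, table
) -> Iterable[MetadataWorkUnit]:
    # The profiler yields plain aspects; the caller wraps each one into an MCP
    # addressed at the dataset URN, so the profiler never needs the URN at all.
    for aspect in profiler.profile_table(dataset_name, table):
        yield MetadataChangeProposalWrapper(
            entityUrn=dataset_urn, aspect=aspect
        ).as_workunit()
```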
datahub/ingestion/source/powerbi/config.py (+31 -4)

@@ -11,6 +11,9 @@ import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+)
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
@@ -19,6 +22,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.global_warning_util import add_global_warning
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -183,6 +187,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="databricks",
     )
 
+    MYSQL = DataPlatformPair(
+        powerbi_data_platform_name="MySQL",
+        datahub_data_platform_name="mysql",
+    )
+
 
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -275,7 +284,7 @@ class PowerBiProfilingConfig(ConfigModel):
 
 
 class PowerBiDashboardSourceConfig(
-    StatefulIngestionConfigBase, DatasetSourceConfigMixin
+    StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
     platform_name: str = pydantic.Field(
         default=Constant.PLATFORM_NAME, hidden_from_docs=True
@@ -297,7 +306,15 @@ class PowerBiDashboardSourceConfig(
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns to filter PowerBI workspaces in ingestion."
+        description="Regex patterns to filter PowerBI workspaces in ingestion by ID."
+        " By default all IDs are allowed unless they are filtered by name using 'workspace_name_pattern'."
+        " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
+    )
+    # PowerBi workspace name
+    workspace_name_pattern: AllowDenyPattern = pydantic.Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns to filter PowerBI workspaces in ingestion by name."
+        " By default all names are allowed unless they are filtered by ID using 'workspace_id_pattern'."
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )
 
@@ -373,8 +390,9 @@ class PowerBiDashboardSourceConfig(
     )
     # Enable/Disable extracting dataset schema
     extract_dataset_schema: bool = pydantic.Field(
-        default=…,
-        description="Whether to ingest PBI Dataset Table columns and measures"
+        default=True,
+        description="Whether to ingest PBI Dataset Table columns and measures."
+        " Note: this setting must be `true` for schema extraction and column lineage to be enabled.",
     )
     # Enable/Disable extracting lineage information of PowerBI Dataset
     extract_lineage: bool = pydantic.Field(
@@ -510,6 +528,7 @@ class PowerBiDashboardSourceConfig(
             "native_query_parsing",
             "enable_advance_lineage_sql_construct",
             "extract_lineage",
+            "extract_dataset_schema",
         ]
 
         if (
@@ -575,3 +594,11 @@ class PowerBiDashboardSourceConfig(
             )
 
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_extract_dataset_schema(cls, values: Dict) -> Dict:
+        if values.get("extract_dataset_schema") is False:
+            add_global_warning(
+                "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
+            )
+        return values