acryl-datahub 1.0.0.1rc5__py3-none-any.whl → 1.0.0.1rc6__py3-none-any.whl


Potentially problematic release.



@@ -38,6 +38,7 @@ from pyiceberg.types import (
 )
 
 from datahub.emitter.mce_builder import (
+    make_container_urn,
     make_data_platform_urn,
     make_dataplatform_instance_urn,
     make_dataset_urn_with_platform_instance,
@@ -45,6 +46,7 @@ from datahub.emitter.mce_builder import (
     make_user_urn,
 )
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import NamespaceKey
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -57,6 +59,10 @@ from datahub.ingestion.api.decorators import (
 from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
+from datahub.ingestion.source.common.subtypes import (
+    DatasetContainerSubTypes,
+    DatasetSubTypes,
+)
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergSourceConfig,
     IcebergSourceReport,
@@ -68,19 +74,22 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import Status
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status, SubTypes
+from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     OtherSchema,
     SchemaField,
     SchemaMetadata,
 )
 from datahub.metadata.schema_classes import (
+    ContainerClass,
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
     OwnerClass,
     OwnershipClass,
     OwnershipTypeClass,
     TimeStampClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor
@@ -119,9 +128,10 @@ class IcebergSource(StatefulIngestionSourceBase):
     [pyiceberg library](https://py.iceberg.apache.org/).
     """
 
+    platform: str = "iceberg"
+
     def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None:
         super().__init__(config, ctx)
-        self.platform: str = "iceberg"
         self.report: IcebergSourceReport = IcebergSourceReport()
         self.config: IcebergSourceConfig = config
 
@@ -138,13 +148,12 @@ class IcebergSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]
 
-    def _get_datasets(self, catalog: Catalog) -> Iterable[Identifier]:
+    def _get_namespaces(self, catalog: Catalog) -> Iterable[Identifier]:
         namespaces = catalog.list_namespaces()
         LOGGER.debug(
             f"Retrieved {len(namespaces)} namespaces, first 10: {namespaces[:10]}"
         )
         self.report.report_no_listed_namespaces(len(namespaces))
-        tables_count = 0
         for namespace in namespaces:
             namespace_repr = ".".join(namespace)
             if not self.config.namespace_pattern.allowed(namespace_repr):
@@ -153,6 +162,14 @@ class IcebergSource(StatefulIngestionSourceBase):
                 )
                 self.report.report_dropped(f"{namespace_repr}.*")
                 continue
+            yield namespace
+
+    def _get_datasets(
+        self, catalog: Catalog, namespaces: Iterable[Tuple[Identifier, str]]
+    ) -> Iterable[Tuple[Identifier, str]]:
+        LOGGER.debug("Starting to retrieve tables")
+        tables_count = 0
+        for namespace, namespace_urn in namespaces:
             try:
                 tables = catalog.list_tables(namespace)
                 tables_count += len(tables)
@@ -162,29 +179,27 @@ class IcebergSource(StatefulIngestionSourceBase):
                 self.report.report_listed_tables_for_namespace(
                     ".".join(namespace), len(tables)
                 )
-                yield from tables
-            except NoSuchNamespaceError:
-                self.report.report_warning(
-                    "no-such-namespace",
-                    f"Couldn't list tables for namespace {namespace} due to NoSuchNamespaceError exception",
-                )
-                LOGGER.warning(
-                    f"NoSuchNamespaceError exception while trying to get list of tables from namespace {namespace}, skipping it",
+                yield from [(table, namespace_urn) for table in tables]
+            except NoSuchNamespaceError as e:
+                self.report.warning(
+                    title="No such namespace",
+                    message="Skipping the missing namespace.",
+                    context=str(namespace),
+                    exc=e,
                 )
             except Exception as e:
                 self.report.report_failure(
-                    "listing-tables-exception",
-                    f"Couldn't list tables for namespace {namespace} due to {e}",
-                )
-                LOGGER.exception(
-                    f"Unexpected exception while trying to get list of tables for namespace {namespace}, skipping it"
+                    title="Error when processing a namespace",
+                    message="Skipping the namespace due to errors while processing it.",
+                    context=str(namespace),
+                    exc=e,
                 )
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         thread_local = threading.local()
 
         def _try_processing_dataset(
-            dataset_path: Tuple[str, ...], dataset_name: str
+            dataset_path: Tuple[str, ...], dataset_name: str, namespace_urn: str
         ) -> Iterable[MetadataWorkUnit]:
             try:
                 if not hasattr(thread_local, "local_catalog"):
@@ -200,56 +215,66 @@ class IcebergSource(StatefulIngestionSourceBase):
                     time_taken, dataset_name, table.metadata_location
                 )
                 LOGGER.debug(f"Loaded table: {table.name()}, time taken: {time_taken}")
-                yield from self._create_iceberg_workunit(dataset_name, table)
-            except NoSuchPropertyException as e:
-                self.report.report_warning(
-                    "table-property-missing",
-                    f"Failed to create workunit for {dataset_name}. {e}",
+                dataset_urn: str = make_dataset_urn_with_platform_instance(
+                    self.platform,
+                    dataset_name,
+                    self.config.platform_instance,
+                    self.config.env,
                 )
-                LOGGER.warning(
-                    f"NoSuchPropertyException while processing table {dataset_path}, skipping it.",
+                for aspect in self._create_iceberg_table_aspects(
+                    dataset_name, table, namespace_urn
+                ):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=aspect
+                    ).as_workunit()
+            except NoSuchPropertyException as e:
+                self.report.warning(
+                    title="Unable to process table",
+                    message="Table was not processed due to expected property missing (table is probably not an iceberg table).",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchIcebergTableError as e:
-                self.report.report_warning(
-                    "not-an-iceberg-table",
-                    f"Failed to create workunit for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"NoSuchIcebergTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Skipped non-iceberg table",
+                    message="Table was recognized as non-iceberg and skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except NoSuchTableError as e:
-                self.report.report_warning(
-                    "no-such-table",
-                    f"Failed to create workunit for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"NoSuchTableError while processing table {dataset_path}, skipping it.",
+                self.report.warning(
+                    title="Table not found",
+                    message="Table was returned by the catalog in the list of table but catalog can't find its details, table was skipped.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except FileNotFoundError as e:
-                self.report.report_warning(
-                    "file-not-found",
-                    f"Encountered FileNotFoundError when trying to read manifest file for {dataset_name}. {e}",
-                )
-                LOGGER.warning(
-                    f"FileNotFoundError while processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Manifest file not found",
+                    message="Couldn't find manifest file to read for the table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ServerError as e:
-                self.report.report_warning(
-                    "iceberg-rest-server-error",
-                    f"Iceberg Rest Catalog returned 500 status due to an unhandled exception for {dataset_name}. Exception: {e}",
-                )
-                LOGGER.warning(
-                    f"Iceberg Rest Catalog server error (500 status) encountered when processing table {dataset_path}, skipping it."
+                self.report.warning(
+                    title="Iceberg REST Server Error",
+                    message="Iceberg returned 500 HTTP status when trying to process a table, skipping it.",
+                    context=dataset_name,
+                    exc=e,
                 )
             except ValueError as e:
                 if "Could not initialize FileIO" not in str(e):
                     raise
                 self.report.warning(
-                    "Could not initialize FileIO",
-                    f"Could not initialize FileIO for {dataset_path} due to: {e}",
+                    title="Could not initialize FileIO",
+                    message="Could not initialize FileIO for a table (are you using custom FileIO?). Skipping the table.",
+                    context=dataset_name,
+                    exc=e,
                 )
 
-        def _process_dataset(dataset_path: Identifier) -> Iterable[MetadataWorkUnit]:
+        def _process_dataset(
+            dataset_path: Identifier, namespace_urn: str
+        ) -> Iterable[MetadataWorkUnit]:
             try:
                 LOGGER.debug(f"Processing dataset for path {dataset_path}")
                 dataset_name = ".".join(dataset_path)
@@ -261,66 +286,88 @@
                     )
                     return
 
-                yield from _try_processing_dataset(dataset_path, dataset_name)
+                yield from _try_processing_dataset(
+                    dataset_path, dataset_name, namespace_urn
+                )
             except Exception as e:
                 self.report.report_failure(
-                    "general",
-                    f"Failed to create workunit for dataset {dataset_path}: {e}",
-                )
-                LOGGER.exception(
-                    f"Exception while processing table {dataset_path}, skipping it.",
+                    title="Error when processing a table",
+                    message="Skipping the table due to errors when processing it.",
+                    context=str(dataset_path),
+                    exc=e,
                 )
 
         try:
             catalog = self.config.get_catalog()
         except Exception as e:
-            self.report.report_failure("get-catalog", f"Failed to get catalog: {e}")
+            self.report.report_failure(
+                title="Failed to initialize catalog object",
+                message="Couldn't start the ingestion due to failure to initialize catalog object.",
+                exc=e,
+            )
+            return
+
+        try:
+            namespace_ids = self._get_namespaces(catalog)
+            namespaces: List[Tuple[Identifier, str]] = []
+            for namespace in namespace_ids:
+                namespace_repr = ".".join(namespace)
+                LOGGER.debug(f"Processing namespace {namespace_repr}")
+                namespace_urn = make_container_urn(
+                    NamespaceKey(
+                        namespace=namespace_repr,
+                        platform=self.platform,
+                        instance=self.config.platform_instance,
+                        env=self.config.env,
+                    )
+                )
+                namespaces.append((namespace, namespace_urn))
+                for aspect in self._create_iceberg_namespace_aspects(namespace):
+                    yield MetadataChangeProposalWrapper(
+                        entityUrn=namespace_urn, aspect=aspect
+                    ).as_workunit()
+            LOGGER.debug("Namespaces ingestion completed")
+        except Exception as e:
+            self.report.report_failure(
+                title="Failed to list namespaces",
+                message="Couldn't start the ingestion due to a failure to process the list of the namespaces",
+                exc=e,
+            )
             return
 
         for wu in ThreadedIteratorExecutor.process(
             worker_func=_process_dataset,
-            args_list=[(dataset_path,) for dataset_path in self._get_datasets(catalog)],
+            args_list=[
+                (dataset_path, namespace_urn)
+                for dataset_path, namespace_urn in self._get_datasets(
+                    catalog, namespaces
+                )
+            ],
             max_workers=self.config.processing_threads,
         ):
             yield wu
 
-    def _create_iceberg_workunit(
-        self, dataset_name: str, table: Table
-    ) -> Iterable[MetadataWorkUnit]:
+    def _create_iceberg_table_aspects(
+        self, dataset_name: str, table: Table, namespace_urn: str
+    ) -> Iterable[_Aspect]:
         with PerfTimer() as timer:
             self.report.report_table_scanned(dataset_name)
             LOGGER.debug(f"Processing table {dataset_name}")
-            dataset_urn: str = make_dataset_urn_with_platform_instance(
-                self.platform,
-                dataset_name,
-                self.config.platform_instance,
-                self.config.env,
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn, aspect=Status(removed=False)
-            ).as_workunit()
+            yield Status(removed=False)
+            yield SubTypes(typeNames=[DatasetSubTypes.TABLE])
 
-            dataset_properties = self._get_dataset_properties_aspect(
-                dataset_name, table
-            )
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn, aspect=dataset_properties
-            ).as_workunit()
+            yield self._get_dataset_properties_aspect(dataset_name, table)
 
             dataset_ownership = self._get_ownership_aspect(table)
             if dataset_ownership:
                 LOGGER.debug(
                     f"Adding ownership: {dataset_ownership} to the dataset {dataset_name}"
                 )
-                yield MetadataChangeProposalWrapper(
-                    entityUrn=dataset_urn, aspect=dataset_ownership
-                ).as_workunit()
+                yield dataset_ownership
 
-            schema_metadata = self._create_schema_metadata(dataset_name, table)
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn, aspect=schema_metadata
-            ).as_workunit()
-            yield self._get_dataplatform_instance_aspect(dataset_urn=dataset_urn)
+            yield self._create_schema_metadata(dataset_name, table)
+            yield self._get_dataplatform_instance_aspect()
+            yield ContainerClass(container=str(namespace_urn))
 
             self.report.report_table_processing_time(
                 timer.elapsed_seconds(), dataset_name, table.metadata_location
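Editor's note: the hunk above introduces one container entity per Iceberg namespace and then points each table at it. A minimal sketch of that pairing, using only the `NamespaceKey` and aspect classes shown in this diff (the namespace, instance, and env values are hypothetical):

```python
from datahub.emitter.mce_builder import make_container_urn
from datahub.emitter.mcp_builder import NamespaceKey
from datahub.metadata.schema_classes import ContainerClass

# Build the container URN for a namespace (values are illustrative only).
namespace_urn = make_container_urn(
    NamespaceKey(
        namespace="db.analytics",
        platform="iceberg",
        instance=None,
        env="PROD",
    )
)

# Each dataset then gets a Container aspect pointing at that URN.
container_aspect = ContainerClass(container=str(namespace_urn))
```

In the source above, the same URN is also used as `entityUrn` when the namespace's own Status, ContainerProperties, SubTypes, and DataPlatformInstance aspects are emitted.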
@@ -328,7 +375,7 @@ class IcebergSource(StatefulIngestionSourceBase):
 
         if self.config.is_profiling_enabled():
             profiler = IcebergProfiler(self.report, self.config.profiling)
-            yield from profiler.profile_table(dataset_name, dataset_urn, table)
+            yield from profiler.profile_table(dataset_name, table)
 
     def _get_partition_aspect(self, table: Table) -> Optional[str]:
         """Extracts partition information from the provided table and returns a JSON array representing the [partition spec](https://iceberg.apache.org/spec/?#partition-specs) of the table.
@@ -367,9 +414,11 @@
                 ]
             )
         except Exception as e:
-            self.report.report_warning(
-                "extract-partition",
-                f"Failed to extract partition spec from Iceberg table {table.name()} due to error: {str(e)}",
+            self.report.warning(
+                title="Failed to extract partition information",
+                message="Failed to extract partition information for a table. Table metadata will be ingested without it.",
+                context=str(table.name),
+                exc=e,
             )
             return None
 
@@ -435,18 +484,15 @@
             )
         return OwnershipClass(owners=owners) if owners else None
 
-    def _get_dataplatform_instance_aspect(self, dataset_urn: str) -> MetadataWorkUnit:
-        return MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=DataPlatformInstanceClass(
-                platform=make_data_platform_urn(self.platform),
-                instance=make_dataplatform_instance_urn(
-                    self.platform, self.config.platform_instance
-                )
-                if self.config.platform_instance
-                else None,
-            ),
-        ).as_workunit()
+    def _get_dataplatform_instance_aspect(self) -> DataPlatformInstanceClass:
+        return DataPlatformInstanceClass(
+            platform=make_data_platform_urn(self.platform),
+            instance=make_dataplatform_instance_urn(
+                self.platform, self.config.platform_instance
+            )
+            if self.config.platform_instance
+            else None,
+        )
 
     def _create_schema_metadata(
         self, dataset_name: str, table: Table
@@ -475,6 +521,17 @@
     def get_report(self) -> SourceReport:
         return self.report
 
+    def _create_iceberg_namespace_aspects(
+        self, namespace: Identifier
+    ) -> Iterable[_Aspect]:
+        namespace_repr = ".".join(namespace)
+        yield Status(removed=False)
+        yield ContainerProperties(
+            name=namespace_repr, qualifiedName=namespace_repr, env=self.config.env
+        )
+        yield SubTypes(typeNames=[DatasetContainerSubTypes.NAMESPACE])
+        yield self._get_dataplatform_instance_aspect()
+
 
 class ToAvroSchemaIcebergVisitor(SchemaVisitorPerPrimitiveType[Dict[str, Any]]):
     """Implementation of a visitor to build an Avro schema as a dictionary from an Iceberg schema."""
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Callable, Dict, Iterable, Union, cast
+from typing import Any, Callable, Dict, Iterable, Optional, cast
 
 from pyiceberg.conversions import from_bytes
 from pyiceberg.schema import Schema
@@ -24,8 +24,6 @@ from pyiceberg.utils.datetime import (
 )
 
 from datahub.emitter.mce_builder import get_sys_time
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.iceberg.iceberg_common import (
     IcebergProfilingConfig,
     IcebergSourceReport,
@@ -33,6 +31,7 @@ from datahub.ingestion.source.iceberg.iceberg_common import (
 from datahub.metadata.schema_classes import (
     DatasetFieldProfileClass,
     DatasetProfileClass,
+    _Aspect,
 )
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -86,9 +85,8 @@ class IcebergProfiler:
     def profile_table(
         self,
         dataset_name: str,
-        dataset_urn: str,
         table: Table,
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[_Aspect]:
         """This method will profile the supplied Iceberg table by looking at the table's manifest.
 
         The overall profile of the table is aggregated from the individual manifest files.
@@ -167,11 +165,11 @@
                         )
                     total_count += data_file.record_count
         except Exception as e:
-            # Catch any errors that arise from attempting to read the Iceberg table's manifests
-            # This will prevent stateful ingestion from being blocked by an error (profiling is not critical)
-            self.report.report_warning(
-                "profiling",
-                f"Error while profiling dataset {dataset_name}: {e}",
+            self.report.warning(
+                title="Error when profiling a table",
+                message="Skipping profiling of the table due to errors",
+                context=dataset_name,
+                exc=e,
             )
         if row_count:
             # Iterating through fieldPaths introduces unwanted stats for list element fields...
@@ -211,14 +209,11 @@
            f"Finished profiling of dataset: {dataset_name} in {time_taken}"
         )
 
-        yield MetadataChangeProposalWrapper(
-            entityUrn=dataset_urn,
-            aspect=dataset_profile,
-        ).as_workunit()
+        yield dataset_profile
 
     def _render_value(
         self, dataset_name: str, value_type: IcebergType, value: Any
-    ) -> Union[str, None]:
+    ) -> Optional[str]:
         try:
             if isinstance(value_type, TimestampType):
                 return to_human_timestamp(value)
@@ -230,9 +225,17 @@
                 return to_human_time(value)
             return str(value)
         except Exception as e:
-            self.report.report_warning(
-                "profiling",
-                f"Error in dataset {dataset_name} when profiling a {value_type} field with value {value}: {e}",
+            self.report.warning(
+                title="Couldn't render value when profiling a table",
+                message="Encountered error, when trying to redner a value for table profile.",
+                context=str(
+                    {
+                        "value": value,
+                        "value_type": value_type,
+                        "dataset_name": dataset_name,
+                    }
+                ),
+                exc=e,
             )
             return None
 
@@ -11,6 +11,9 @@ import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+)
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
@@ -19,6 +22,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.global_warning_util import add_global_warning
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer
 
@@ -183,6 +187,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="databricks",
     )
 
+    MYSQL = DataPlatformPair(
+        powerbi_data_platform_name="MySQL",
+        datahub_data_platform_name="mysql",
+    )
+
 
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -275,7 +284,7 @@
 
 
 class PowerBiDashboardSourceConfig(
-    StatefulIngestionConfigBase, DatasetSourceConfigMixin
+    StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
     platform_name: str = pydantic.Field(
         default=Constant.PLATFORM_NAME, hidden_from_docs=True
@@ -297,7 +306,15 @@
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns to filter PowerBI workspaces in ingestion."
+        description="Regex patterns to filter PowerBI workspaces in ingestion by ID."
+        " By default all IDs are allowed unless they are filtered by name using 'workspace_name_pattern'."
+        " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
+    )
+    # PowerBi workspace name
+    workspace_name_pattern: AllowDenyPattern = pydantic.Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns to filter PowerBI workspaces in ingestion by name."
+        " By default all names are allowed unless they are filtered by ID using 'workspace_id_pattern'."
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )
 
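Editor's note: the new `workspace_name_pattern` follows the same allow/deny regex semantics as the existing `workspace_id_pattern` (and the `namespace_pattern` used in the Iceberg hunks above). A small sketch of how such a pattern filters names, assuming only the `AllowDenyPattern` API already used in this diff (the workspace names are made up):

```python
from datahub.configuration.common import AllowDenyPattern

# Hypothetical pattern: ingest only workspaces whose names start with "Finance".
workspace_name_pattern = AllowDenyPattern(allow=["Finance.*"], deny=[])

print(workspace_name_pattern.allowed("Finance EMEA"))  # True
print(workspace_name_pattern.allowed("Sales APAC"))    # False
```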
@@ -373,8 +390,9 @@
     )
     # Enable/Disable extracting dataset schema
     extract_dataset_schema: bool = pydantic.Field(
-        default=False,
-        description="Whether to ingest PBI Dataset Table columns and measures",
+        default=True,
+        description="Whether to ingest PBI Dataset Table columns and measures."
+        " Note: this setting must be `true` for schema extraction and column lineage to be enabled.",
     )
     # Enable/Disable extracting lineage information of PowerBI Dataset
     extract_lineage: bool = pydantic.Field(
@@ -510,6 +528,7 @@
             "native_query_parsing",
             "enable_advance_lineage_sql_construct",
             "extract_lineage",
+            "extract_dataset_schema",
         ]
 
         if (
@@ -575,3 +594,11 @@
         )
 
         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_extract_dataset_schema(cls, values: Dict) -> Dict:
+        if values.get("extract_dataset_schema") is False:
+            add_global_warning(
+                "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
+            )
+        return values
@@ -74,3 +74,4 @@ class FunctionName(Enum):
     GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
     AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
+    MYSQL_DATA_ACCESS = "MySQL.Database"