acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.
Files changed (106)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2391 -2392
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +105 -88
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  59. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  62. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  63. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  64. datahub/ingestion/source/sql/mssql/source.py +8 -4
  65. datahub/ingestion/source/sql/oracle.py +51 -4
  66. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  67. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  68. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  69. datahub/ingestion/source/superset.py +291 -35
  70. datahub/ingestion/source/usage/usage_common.py +0 -65
  71. datahub/ingestion/source/vertexai/__init__.py +0 -0
  72. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  73. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  74. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  75. datahub/metadata/_schema_classes.py +472 -1
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  80. datahub/metadata/schema.avsc +313 -2
  81. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  82. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  83. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  84. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  85. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  86. datahub/metadata/schemas/Deprecation.avsc +2 -0
  87. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  89. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  90. datahub/metadata/schemas/Siblings.avsc +2 -0
  91. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  92. datahub/sdk/__init__.py +1 -0
  93. datahub/sdk/dataset.py +122 -0
  94. datahub/sdk/entity.py +99 -3
  95. datahub/sdk/entity_client.py +27 -3
  96. datahub/sdk/main_client.py +24 -1
  97. datahub/sdk/search_client.py +81 -8
  98. datahub/sdk/search_filters.py +94 -37
  99. datahub/sql_parsing/split_statements.py +17 -3
  100. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  101. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  102. datahub/testing/mcp_diff.py +1 -18
  103. datahub/utilities/threaded_iterator_executor.py +16 -3
  104. datahub/ingestion/source/vertexai.py +0 -697
  105. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  106. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/mlflow.py
+++ b/datahub/ingestion/source/mlflow.py
@@ -1,9 +1,11 @@
+import json
+import os
 import time
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union

 from mlflow import MlflowClient
-from mlflow.entities import Experiment, Run
+from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field
@@ -29,6 +31,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
 from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -42,6 +45,7 @@ from datahub.metadata.schema_classes import (
     AuditStampClass,
     ContainerClass,
     DataPlatformInstanceClass,
+    DataProcessInstanceInputClass,
     DataProcessInstanceOutputClass,
     DataProcessInstancePropertiesClass,
     DataProcessInstanceRunEventClass,
@@ -60,16 +64,15 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TagPropertiesClass,
     TimeStampClass,
+    UpstreamClass,
+    UpstreamLineageClass,
     VersionPropertiesClass,
     VersionTagClass,
     _Aspect,
 )
-from datahub.metadata.urns import (
-    DataPlatformUrn,
-    MlModelUrn,
-    VersionSetUrn,
-)
+from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
 from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset

 T = TypeVar("T")

@@ -105,6 +108,20 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
             " If neither is set, external URLs are not generated."
         ),
     )
+    materialize_dataset_inputs: Optional[bool] = Field(
+        default=False,
+        description="Whether to materialize dataset inputs for each run",
+    )
+    source_mapping_to_platform: Optional[dict] = Field(
+        default=None, description="Mapping of source type to datahub platform"
+    )
+
+    username: Optional[str] = Field(
+        default=None, description="Username for MLflow authentication"
+    )
+    password: Optional[str] = Field(
+        default=None, description="Password for MLflow authentication"
+    )


 @dataclass
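The new MLflowConfig fields above can be exercised directly; below is a minimal sketch that parses them the same way MLflowSource.create does via MLflowConfig.parse_obj. The tracking URI, credentials, and source-type mapping are hypothetical placeholders, not values from this release.

# Minimal sketch of the new MLflowConfig options; all values are placeholders.
from datahub.ingestion.source.mlflow import MLflowConfig

config = MLflowConfig.parse_obj(
    {
        "tracking_uri": "http://localhost:5000",
        "username": "mlflow-user",      # exported as MLFLOW_TRACKING_USERNAME
        "password": "mlflow-password",  # exported as MLFLOW_TRACKING_PASSWORD
        "materialize_dataset_inputs": True,
        # Point MLflow dataset source types without a built-in mapping at a DataHub platform.
        "source_mapping_to_platform": {"pyspark": "databricks", "http": "s3"},
    }
)
print(config.materialize_dataset_inputs, config.source_mapping_to_platform)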
@@ -152,7 +169,17 @@ class MLflowSource(StatefulIngestionSourceBase):
         self.ctx = ctx
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
-        self.client = MlflowClient(
+        self.client = self._configure_client()
+
+    def _configure_client(self) -> MlflowClient:
+        if bool(self.config.username) != bool(self.config.password):
+            raise ValueError("Both username and password must be set together")
+
+        if self.config.username and self.config.password:
+            os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.username
+            os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.password
+
+        return MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
         )
@@ -213,6 +240,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         if runs:
             for run in runs:
                 yield from self._get_run_workunits(experiment, run)
+                yield from self._get_dataset_input_workunits(run)

     def _get_experiment_custom_properties(self, experiment):
         experiment_custom_props = getattr(experiment, "tags", {}) or {}
@@ -262,6 +290,183 @@ class MLflowSource(StatefulIngestionSourceBase):
             type="SKIPPED", nativeResultType=self.platform
         )

+    def _get_dataset_schema(
+        self, dataset: MlflowDataset
+    ) -> Optional[List[Tuple[str, str]]]:
+        try:
+            schema_dict = json.loads(dataset.schema)
+        except json.JSONDecodeError:
+            self.report.warning(
+                title="Failed to load dataset schema",
+                message="Schema metadata will be missing due to a JSON parsing error.",
+                context=f"Dataset: {dataset.name}, Schema: {dataset.schema}",
+            )
+            return None
+
+        if "mlflow_colspec" in schema_dict:
+            try:
+                return [
+                    (field["name"], field["type"])
+                    for field in schema_dict["mlflow_colspec"]
+                ]
+            except (KeyError, TypeError):
+                return None
+        # If the schema is not formatted, return None
+        return None
+
+    def _get_external_dataset_urn(self, platform: str, dataset_name: str) -> str:
+        """
+        Get the URN for an external dataset.
+        Args:
+            platform: The platform of the external dataset (e.g., 's3', 'bigquery')
+            dataset: The MLflow dataset
+        Returns:
+            str: The URN of the external dataset
+        """
+        return str(DatasetUrn(platform=platform, name=dataset_name))
+
+    def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate workunits for dataset inputs in a run.
+
+        For each dataset input:
+        1. If source type is 'local' or 'code':
+            - Create a local dataset reference
+        2. Otherwise:
+            - If materialization is enabled:
+                - Create a hosted dataset and a dataset reference with upstream
+            - If materialization is not enabled:
+                - Create a dataset reference and add upstream if dataset exists
+        3. Add all dataset references as upstreams for the run
+        """
+        run_urn = DataProcessInstance(
+            id=run.info.run_id,
+            orchestrator=self.platform,
+        ).urn
+
+        dataset_reference_urns = []
+
+        for dataset_input in run.inputs.dataset_inputs:
+            dataset = dataset_input.dataset
+            source_type = dataset.source_type
+            dataset_tags = {k[1]: v[1] for k, v in dataset_input.tags}
+
+            # Prepare dataset properties
+            custom_properties = dataset_tags
+            formatted_schema = self._get_dataset_schema(dataset)
+            if formatted_schema is None:
+                custom_properties["schema"] = dataset.schema
+
+            # Handle local/code datasets
+            if source_type in ("local", "code"):
+                local_dataset = Dataset(
+                    platform=self.platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=custom_properties,
+                )
+                yield from local_dataset.as_workunits()
+                dataset_reference_urns.append(local_dataset.urn)
+                continue
+
+            # Handle hosted datasets
+            formatted_platform = self._get_dataset_platform_from_source_type(
+                source_type
+            )
+
+            # Validate platform if materialization is enabled
+            if self.config.materialize_dataset_inputs:
+                if not formatted_platform:
+                    self.report.failure(
+                        title="Unable to materialize dataset inputs",
+                        message=f"No mapping dataPlatform found for dataset input source type '{source_type}'",
+                        context=f"please add `materialize_dataset_inputs.source_mapping_to_platform` in config "
+                        f"(e.g. '{source_type}': 'snowflake')",
+                    )
+                    continue
+                # Create hosted dataset
+                hosted_dataset = Dataset(
+                    platform=formatted_platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=dataset_tags,
+                )
+                yield from hosted_dataset.as_workunits()

+            # Create dataset reference with upstream
+            hosted_dataset_reference = Dataset(
+                platform=self.platform,
+                name=dataset.name,
+                schema=formatted_schema,
+                custom_properties=dataset_tags,
+                upstreams=UpstreamLineageClass(
+                    upstreams=[
+                        UpstreamClass(
+                            self._get_external_dataset_urn(
+                                formatted_platform, dataset.name
+                            ),
+                            type="COPY",
+                        )
+                    ]
+                )
+                if formatted_platform
+                else None,
+            )
+            dataset_reference_urns.append(hosted_dataset_reference.urn)
+            yield from hosted_dataset_reference.as_workunits()
+
+        # Add dataset references as upstreams for the run
+        if dataset_reference_urns:
+            input_edges = [
+                EdgeClass(destinationUrn=str(dataset_ref_urn))
+                for dataset_ref_urn in dataset_reference_urns
+            ]
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(run_urn),
+                aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
+            ).as_workunit()
+
+    def _get_dataset_platform_from_source_type(self, source_type: str) -> Optional[str]:
+        """
+        Map MLflow source type to DataHub platform.
+
+        Priority:
+        1. User-provided mapping in config
+        2. Internal mapping
+        3. Direct platform match from list of supported platforms
+        """
+        source_type = source_type.lower()
+
+        # User-provided mapping
+        platform = self._get_platform_from_user_mapping(source_type)
+        if platform:
+            return platform
+
+        # Internal mapping
+        if source_type == "gs":
+            return "gcs"
+
+        # Check direct platform match
+        if self._is_valid_platform(source_type):
+            return source_type
+
+        return None
+
+    def _get_platform_from_user_mapping(self, source_type: str) -> Optional[str]:
+        """
+        Get platform from user-provided mapping in config.
+        Returns None if mapping is invalid or platform is not supported.
+        """
+        source_mapping = self.config.source_mapping_to_platform
+        if not source_mapping:
+            return None
+
+        platform = source_mapping.get(source_type)
+        if not platform:
+            return None
+
+        return platform
+
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
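For context on _get_dataset_schema above: MLflow stores a dataset's schema as a JSON string that may carry an "mlflow_colspec" list of {name, type} entries, and the source converts that into (name, type) tuples. A standalone sketch of the same parsing, with a made-up schema payload:

# Standalone sketch of the "mlflow_colspec" parsing done by _get_dataset_schema;
# the schema payload below is a made-up example.
import json
from typing import List, Optional, Tuple


def parse_colspec(schema_json: str) -> Optional[List[Tuple[str, str]]]:
    try:
        schema_dict = json.loads(schema_json)
    except json.JSONDecodeError:
        return None  # the source then keeps the raw string as a custom property
    if "mlflow_colspec" in schema_dict:
        try:
            return [(f["name"], f["type"]) for f in schema_dict["mlflow_colspec"]]
        except (KeyError, TypeError):
            return None
    return None


example = json.dumps(
    {
        "mlflow_colspec": [
            {"type": "double", "name": "sepal_length"},
            {"type": "string", "name": "species"},
        ]
    }
)
assert parse_colspec(example) == [("sepal_length", "double"), ("species", "string")]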
@@ -659,6 +864,10 @@ class MLflowSource(StatefulIngestionSourceBase):
         )
         return wu

+    def _is_valid_platform(self, platform: Optional[str]) -> bool:
+        """Check if platform is registered as a source plugin"""
+        return platform in KNOWN_VALID_PLATFORM_NAMES
+
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
         config = MLflowConfig.parse_obj(config_dict)
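A hedged end-to-end sketch of running the updated MLflow source through a standard ingestion pipeline; the server addresses are placeholders, and it assumes the source is registered under the `mlflow` plugin name:

# Sketch only: wiring the MLflow source into a pipeline run with placeholder endpoints.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "mlflow",
            "config": {
                "tracking_uri": "http://localhost:5000",
                "materialize_dataset_inputs": False,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()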
--- a/datahub/ingestion/source/mode.py
+++ b/datahub/ingestion/source/mode.py
@@ -33,6 +33,7 @@ from datahub.emitter.mcp_builder import (
     add_dataset_to_container,
     gen_containers,
 )
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -339,7 +340,8 @@ class ModeSource(StatefulIngestionSourceBase):

         # Test the connection
         try:
-            self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            key_info = self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            logger.debug(f"Auth info: {key_info}")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
@@ -1485,12 +1487,17 @@

         @r.wraps
         def get_request():
+            curl_command = make_curl_command(self.session, "GET", url, "")
+            logger.debug(f"Issuing request; curl equivalent: {curl_command}")
+
             try:
                 response = self.session.get(
                     url, timeout=self.config.api_options.timeout
                 )
                 if response.status_code == 204:  # No content, don't parse json
                     return {}
+
+                response.raise_for_status()
                 return response.json()
             except HTTPError as http_error:
                 error_response = http_error.response
@@ -1501,6 +1508,9 @@
                         time.sleep(float(sleep_time))
                     raise HTTPError429 from None

+                logger.debug(
+                    f"Error response ({error_response.status_code}): {error_response.text}"
+                )
                 raise http_error

         return get_request()
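The mode.py changes above reuse make_curl_command from datahub.emitter.request_helper to log a copy-pasteable curl equivalent of each request before it is issued. A rough sketch of that debugging pattern outside the source, with a hypothetical URL and token:

# Sketch of the debug-logging pattern added to ModeSource; URL and token are placeholders.
import logging

import requests

from datahub.emitter.request_helper import make_curl_command

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

session = requests.Session()
session.headers.update({"Authorization": "Basic <token>"})

url = "https://app.mode.com/api/verify"
# Same call shape as in the diff: (session, method, url, payload).
curl_command = make_curl_command(session, "GET", url, "")
logger.debug(f"Issuing request; curl equivalent: {curl_command}")

response = session.get(url, timeout=40)
response.raise_for_status()  # mirrors the new raise_for_status() call in get_request()
print(response.json())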
--- a/datahub/ingestion/source/openapi.py
+++ b/datahub/ingestion/source/openapi.py
@@ -2,13 +2,14 @@ import logging
 import time
 import warnings
 from abc import ABC
-from typing import Dict, Iterable, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple

 from pydantic import validator
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -20,6 +21,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.openapi_parser import (
     clean_url,
     compose_url_attr,
@@ -32,14 +34,13 @@ from datahub.ingestion.source.openapi_parser import (
     set_metadata,
     try_guessing,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
     DatasetPropertiesClass,
     GlobalTagsClass,
     InstitutionalMemoryClass,
     InstitutionalMemoryMetadataClass,
+    SubTypesClass,
     TagAssociationClass,
 )

@@ -222,8 +223,9 @@

     def init_dataset(
         self, endpoint_k: str, endpoint_dets: dict
-    ) -> Tuple[DatasetSnapshot, str]:
+    ) -> Tuple[str, str, List[MetadataWorkUnit]]:
         config = self.config
+        workunits = []

         dataset_name = endpoint_k[1:].replace("/", ".")

@@ -233,22 +235,27 @@
         else:
             dataset_name = "root"

-        dataset_snapshot = DatasetSnapshot(
-            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
-            aspects=[],
-        )
+        dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)"

-        # adding description
-        dataset_properties = DatasetPropertiesClass(
+        # Create dataset properties aspect
+        properties = DatasetPropertiesClass(
             description=endpoint_dets["description"], customProperties={}
         )
-        dataset_snapshot.aspects.append(dataset_properties)
+        wu = MetadataWorkUnit(
+            id=dataset_name,
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=properties),
+        )
+        workunits.append(wu)

-        # adding tags
+        # Create tags aspect
         tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
         tags_tac = [TagAssociationClass(t) for t in tags_str]
         gtc = GlobalTagsClass(tags_tac)
-        dataset_snapshot.aspects.append(gtc)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-tags",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=gtc),
+        )
+        workunits.append(wu)

         # the link will appear in the "documentation"
         link_url = clean_url(config.url + self.url_basepath + endpoint_k)
@@ -260,17 +267,25 @@
             url=link_url, description=link_description, createStamp=creation
         )
         inst_memory = InstitutionalMemoryClass([link_metadata])
-        dataset_snapshot.aspects.append(inst_memory)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-docs",
+            mcp=MetadataChangeProposalWrapper(
+                entityUrn=dataset_urn, aspect=inst_memory
+            ),
+        )
+        workunits.append(wu)

-        return dataset_snapshot, dataset_name
+        # Create subtype aspect
+        sub_types = SubTypesClass(typeNames=[DatasetSubTypes.API_ENDPOINT])
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-subtype",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=sub_types),
+        )
+        workunits.append(wu)

-    def build_wu(
-        self, dataset_snapshot: DatasetSnapshot, dataset_name: str
-    ) -> ApiWorkUnit:
-        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        return ApiWorkUnit(id=dataset_name, mce=mce)
+        return dataset_name, dataset_urn, workunits

-    def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         config = self.config

         sw_dict = self.config.get_swagger()
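The refactor above replaces the per-endpoint DatasetSnapshot/MetadataChangeEvent with one MetadataWorkUnit per aspect, each wrapping a MetadataChangeProposalWrapper. A minimal sketch of that per-aspect pattern outside the source (platform, dataset name, and description are made up):

# Minimal sketch of the MCP-per-aspect pattern the openapi source now emits;
# the URN and description below are made up.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import DatasetPropertiesClass

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:OpenApi,test_api.users,PROD)"

properties = DatasetPropertiesClass(
    description="Users endpoint of a demo API", customProperties={}
)
wu = MetadataWorkUnit(
    id="users",
    mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=properties),
)
print(wu.id)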
@@ -294,16 +309,24 @@
             if endpoint_k in config.ignore_endpoints:
                 continue

-            dataset_snapshot, dataset_name = self.init_dataset(
+            # Initialize dataset and get common aspects
+            dataset_name, dataset_urn, workunits = self.init_dataset(
                 endpoint_k, endpoint_dets
             )
+            for wu in workunits:
+                yield wu

-            # adding dataset fields
+            # Handle schema metadata if available
             if "data" in endpoint_dets.keys():
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
-                dataset_snapshot.aspects.append(schema_metadata)
-                yield self.build_wu(dataset_snapshot, dataset_name)
+                wu = MetadataWorkUnit(
+                    id=f"{dataset_name}-schema",
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=schema_metadata
+                    ),
+                )
+                yield wu
             elif endpoint_dets["method"] != "get":
                 self.report.report_warning(
                     title="Failed to Extract Endpoint Metadata",
@@ -338,9 +361,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
@@ -369,9 +396,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
@@ -400,9 +431,13 @@
                         context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                     )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)

--- a/datahub/ingestion/source/powerbi/config.py
+++ b/datahub/ingestion/source/powerbi/config.py
@@ -11,6 +11,9 @@ import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
+from datahub.ingestion.api.incremental_lineage_helper import (
+    IncrementalLineageConfigMixin,
+)
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
@@ -19,6 +22,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionConfigBase,
 )
+from datahub.utilities.global_warning_util import add_global_warning
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.perf_timer import PerfTimer

@@ -183,6 +187,11 @@ class SupportedDataPlatform(Enum):
         datahub_data_platform_name="databricks",
     )

+    MYSQL = DataPlatformPair(
+        powerbi_data_platform_name="MySQL",
+        datahub_data_platform_name="mysql",
+    )
+

 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
@@ -275,7 +284,7 @@


 class PowerBiDashboardSourceConfig(
-    StatefulIngestionConfigBase, DatasetSourceConfigMixin
+    StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
     platform_name: str = pydantic.Field(
         default=Constant.PLATFORM_NAME, hidden_from_docs=True
@@ -297,7 +306,15 @@
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
         default=AllowDenyPattern.allow_all(),
-        description="Regex patterns to filter PowerBI workspaces in ingestion."
+        description="Regex patterns to filter PowerBI workspaces in ingestion by ID."
+        " By default all IDs are allowed unless they are filtered by name using 'workspace_name_pattern'."
+        " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
+    )
+    # PowerBi workspace name
+    workspace_name_pattern: AllowDenyPattern = pydantic.Field(
+        default=AllowDenyPattern.allow_all(),
+        description="Regex patterns to filter PowerBI workspaces in ingestion by name."
+        " By default all names are allowed unless they are filtered by ID using 'workspace_id_pattern'."
         " Note: This field works in conjunction with 'workspace_type_filter' and both must be considered when filtering workspaces.",
     )

@@ -373,8 +390,9 @@
     )
     # Enable/Disable extracting dataset schema
     extract_dataset_schema: bool = pydantic.Field(
-        default=False,
-        description="Whether to ingest PBI Dataset Table columns and measures",
+        default=True,
+        description="Whether to ingest PBI Dataset Table columns and measures."
+        " Note: this setting must be `true` for schema extraction and column lineage to be enabled.",
     )
     # Enable/Disable extracting lineage information of PowerBI Dataset
     extract_lineage: bool = pydantic.Field(
@@ -510,6 +528,7 @@
             "native_query_parsing",
             "enable_advance_lineage_sql_construct",
             "extract_lineage",
+            "extract_dataset_schema",
         ]

         if (
@@ -575,3 +594,11 @@
         )

         return values
+
+    @root_validator(skip_on_failure=True)
+    def validate_extract_dataset_schema(cls, values: Dict) -> Dict:
+        if values.get("extract_dataset_schema") is False:
+            add_global_warning(
+                "Please use `extract_dataset_schema: true`, otherwise dataset schema extraction will be skipped."
+            )
+        return values
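The new workspace_name_pattern field above is a standard AllowDenyPattern, so workspace names are filtered with the usual allow/deny regex semantics. A quick sketch with made-up workspace names:

# Sketch of how an AllowDenyPattern such as workspace_name_pattern filters names;
# the workspace names are made up.
from datahub.configuration.common import AllowDenyPattern

workspace_name_pattern = AllowDenyPattern(
    allow=["^Finance.*"],
    deny=[".*Sandbox$"],
)

for name in ["Finance Reporting", "Finance Sandbox", "Marketing"]:
    print(name, workspace_name_pattern.allowed(name))
# Finance Reporting True, Finance Sandbox False, Marketing False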
--- a/datahub/ingestion/source/powerbi/m_query/data_classes.py
+++ b/datahub/ingestion/source/powerbi/m_query/data_classes.py
@@ -74,3 +74,4 @@ class FunctionName(Enum):
     GOOGLE_BIGQUERY_DATA_ACCESS = "GoogleBigQuery.Database"
     AMAZON_REDSHIFT_DATA_ACCESS = "AmazonRedshift.Database"
     DATABRICK_MULTI_CLOUD_DATA_ACCESS = "DatabricksMultiCloud.Catalogs"
+    MYSQL_DATA_ACCESS = "MySQL.Database"