acryl-datahub 1.0.0.1rc1__py3-none-any.whl → 1.0.0.1rc2__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry, and is provided for informational purposes only.

Note: this version of acryl-datahub has been flagged as a potentially problematic release.

Files changed (62)
  1. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/METADATA +2471 -2470
  2. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/RECORD +61 -46
  3. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/emitter/request_helper.py +19 -14
  8. datahub/ingestion/api/source.py +6 -2
  9. datahub/ingestion/api/source_helpers.py +6 -2
  10. datahub/ingestion/extractor/schema_util.py +1 -0
  11. datahub/ingestion/source/common/data_platforms.py +23 -0
  12. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  13. datahub/ingestion/source/common/subtypes.py +15 -0
  14. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  15. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  16. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  17. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  18. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  19. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  20. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  21. datahub/ingestion/source/hex/__init__.py +0 -0
  22. datahub/ingestion/source/hex/api.py +394 -0
  23. datahub/ingestion/source/hex/constants.py +3 -0
  24. datahub/ingestion/source/hex/hex.py +167 -0
  25. datahub/ingestion/source/hex/mapper.py +372 -0
  26. datahub/ingestion/source/hex/model.py +68 -0
  27. datahub/ingestion/source/iceberg/iceberg.py +62 -66
  28. datahub/ingestion/source/mlflow.py +198 -7
  29. datahub/ingestion/source/mode.py +11 -1
  30. datahub/ingestion/source/openapi.py +69 -34
  31. datahub/ingestion/source/powerbi/powerbi.py +29 -23
  32. datahub/ingestion/source/s3/source.py +11 -0
  33. datahub/ingestion/source/slack/slack.py +399 -82
  34. datahub/ingestion/source/superset.py +15 -6
  35. datahub/ingestion/source/vertexai/__init__.py +0 -0
  36. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  37. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  38. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  39. datahub/metadata/_schema_classes.py +472 -1
  40. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  41. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  42. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  43. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  44. datahub/metadata/schema.avsc +307 -0
  45. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  46. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  47. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  48. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  49. datahub/metadata/schemas/MetadataChangeEvent.avsc +30 -0
  50. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  51. datahub/metadata/schemas/Siblings.avsc +2 -0
  52. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  53. datahub/sdk/dataset.py +122 -0
  54. datahub/sdk/entity.py +99 -3
  55. datahub/sdk/entity_client.py +27 -3
  56. datahub/sdk/main_client.py +22 -0
  57. datahub/sdk/search_filters.py +4 -4
  58. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  59. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  60. datahub/ingestion/source/vertexai.py +0 -695
  61. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc2.dist-info/licenses}/LICENSE +0 -0
  62. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/mlflow.py

@@ -1,9 +1,10 @@
+import json
 import time
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union

 from mlflow import MlflowClient
-from mlflow.entities import Experiment, Run
+from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field
@@ -29,6 +30,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
 from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -42,6 +44,7 @@ from datahub.metadata.schema_classes import (
     AuditStampClass,
     ContainerClass,
     DataPlatformInstanceClass,
+    DataProcessInstanceInputClass,
     DataProcessInstanceOutputClass,
     DataProcessInstancePropertiesClass,
     DataProcessInstanceRunEventClass,
@@ -60,16 +63,15 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TagPropertiesClass,
     TimeStampClass,
+    UpstreamClass,
+    UpstreamLineageClass,
     VersionPropertiesClass,
     VersionTagClass,
     _Aspect,
 )
-from datahub.metadata.urns import (
-    DataPlatformUrn,
-    MlModelUrn,
-    VersionSetUrn,
-)
+from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
 from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset

 T = TypeVar("T")

@@ -105,6 +107,13 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
             " If neither is set, external URLs are not generated."
         ),
     )
+    materialize_dataset_inputs: Optional[bool] = Field(
+        default=False,
+        description="Whether to materialize dataset inputs for each run",
+    )
+    source_mapping_to_platform: Optional[dict] = Field(
+        default=None, description="Mapping of source type to datahub platform"
+    )


 @dataclass
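
The two new MLflowConfig fields above can be exercised together in an ingestion recipe. A minimal sketch of the config dict that MLflowConfig.parse_obj receives, assuming a local tracking server and an illustrative source-type mapping; only the two new field names come from this diff:

    from datahub.ingestion.source.mlflow import MLflowConfig

    config = MLflowConfig.parse_obj(
        {
            "tracking_uri": "http://localhost:5000",  # assumed local MLflow server
            # Emit a materialized DataHub dataset for each run's dataset inputs.
            "materialize_dataset_inputs": True,
            # Fallback mapping from MLflow dataset source types to DataHub platforms;
            # the "http" -> "s3" pairing is purely illustrative.
            "source_mapping_to_platform": {"http": "s3"},
        }
    )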
@@ -213,6 +222,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         if runs:
             for run in runs:
                 yield from self._get_run_workunits(experiment, run)
+                yield from self._get_dataset_input_workunits(run)

     def _get_experiment_custom_properties(self, experiment):
         experiment_custom_props = getattr(experiment, "tags", {}) or {}
@@ -262,6 +272,183 @@ class MLflowSource(StatefulIngestionSourceBase):
                 type="SKIPPED", nativeResultType=self.platform
             )

+    def _get_dataset_schema(
+        self, dataset: MlflowDataset
+    ) -> Optional[List[Tuple[str, str]]]:
+        try:
+            schema_dict = json.loads(dataset.schema)
+        except json.JSONDecodeError:
+            self.report.warning(
+                title="Failed to load dataset schema",
+                message="Schema metadata will be missing due to a JSON parsing error.",
+                context=f"Dataset: {dataset.name}, Schema: {dataset.schema}",
+            )
+            return None
+
+        if "mlflow_colspec" in schema_dict:
+            try:
+                return [
+                    (field["name"], field["type"])
+                    for field in schema_dict["mlflow_colspec"]
+                ]
+            except (KeyError, TypeError):
+                return None
+        # If the schema is not formatted, return None
+        return None
+
+    def _get_external_dataset_urn(self, platform: str, dataset_name: str) -> str:
+        """
+        Get the URN for an external dataset.
+        Args:
+            platform: The platform of the external dataset (e.g., 's3', 'bigquery')
+            dataset: The MLflow dataset
+        Returns:
+            str: The URN of the external dataset
+        """
+        return str(DatasetUrn(platform=platform, name=dataset_name))
+
+    def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate workunits for dataset inputs in a run.
+
+        For each dataset input:
+        1. If source type is 'local' or 'code':
+            - Create a local dataset reference
+        2. Otherwise:
+            - If materialization is enabled:
+                - Create a hosted dataset and a dataset reference with upstream
+            - If materialization is not enabled:
+                - Create a dataset reference and add upstream if dataset exists
+        3. Add all dataset references as upstreams for the run
+        """
+        run_urn = DataProcessInstance(
+            id=run.info.run_id,
+            orchestrator=self.platform,
+        ).urn
+
+        dataset_reference_urns = []
+
+        for dataset_input in run.inputs.dataset_inputs:
+            dataset = dataset_input.dataset
+            source_type = dataset.source_type
+            dataset_tags = {k[1]: v[1] for k, v in dataset_input.tags}
+
+            # Prepare dataset properties
+            custom_properties = dataset_tags
+            formatted_schema = self._get_dataset_schema(dataset)
+            if formatted_schema is None:
+                custom_properties["schema"] = dataset.schema
+
+            # Handle local/code datasets
+            if source_type in ("local", "code"):
+                local_dataset = Dataset(
+                    platform=self.platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=custom_properties,
+                )
+                yield from local_dataset.as_workunits()
+                dataset_reference_urns.append(local_dataset.urn)
+                continue
+
+            # Handle hosted datasets
+            formatted_platform = self._get_dataset_platform_from_source_type(
+                source_type
+            )
+
+            # Validate platform if materialization is enabled
+            if self.config.materialize_dataset_inputs:
+                if not formatted_platform:
+                    self.report.failure(
+                        title="Unable to materialize dataset inputs",
+                        message=f"No mapping dataPlatform found for dataset input source type '{source_type}'",
+                        context=f"please add `materialize_dataset_inputs.source_mapping_to_platform` in config "
+                        f"(e.g. '{source_type}': 'snowflake')",
+                    )
+                    continue
+                # Create hosted dataset
+                hosted_dataset = Dataset(
+                    platform=formatted_platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=dataset_tags,
+                )
+                yield from hosted_dataset.as_workunits()

+            # Create dataset reference with upstream
+            hosted_dataset_reference = Dataset(
+                platform=self.platform,
+                name=dataset.name,
+                schema=formatted_schema,
+                custom_properties=dataset_tags,
+                upstreams=UpstreamLineageClass(
+                    upstreams=[
+                        UpstreamClass(
+                            self._get_external_dataset_urn(
+                                formatted_platform, dataset.name
+                            ),
+                            type="COPY",
+                        )
+                    ]
+                )
+                if formatted_platform
+                else None,
+            )
+            dataset_reference_urns.append(hosted_dataset_reference.urn)
+            yield from hosted_dataset_reference.as_workunits()
+
+        # Add dataset references as upstreams for the run
+        if dataset_reference_urns:
+            input_edges = [
+                EdgeClass(destinationUrn=str(dataset_ref_urn))
+                for dataset_ref_urn in dataset_reference_urns
+            ]
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(run_urn),
+                aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
+            ).as_workunit()
+
+    def _get_dataset_platform_from_source_type(self, source_type: str) -> Optional[str]:
+        """
+        Map MLflow source type to DataHub platform.
+
+        Priority:
+        1. User-provided mapping in config
+        2. Internal mapping
+        3. Direct platform match from list of supported platforms
+        """
+        source_type = source_type.lower()
+
+        # User-provided mapping
+        platform = self._get_platform_from_user_mapping(source_type)
+        if platform:
+            return platform
+
+        # Internal mapping
+        if source_type == "gs":
+            return "gcs"
+
+        # Check direct platform match
+        if self._is_valid_platform(source_type):
+            return source_type
+
+        return None
+
+    def _get_platform_from_user_mapping(self, source_type: str) -> Optional[str]:
+        """
+        Get platform from user-provided mapping in config.
+        Returns None if mapping is invalid or platform is not supported.
+        """
+        source_mapping = self.config.source_mapping_to_platform
+        if not source_mapping:
+            return None
+
+        platform = source_mapping.get(source_type)
+        if not platform:
+            return None
+
+        return platform
+
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
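
For context on what populates the run.inputs.dataset_inputs consumed above, here is a hedged sketch of logging a dataset input with MLflow's dataset-tracking API (MLflow 2.x); the DataFrame contents, the s3:// source URI, and the "training" context tag are illustrative assumptions:

    import mlflow
    import mlflow.data
    import pandas as pd

    # Illustrative training data; the s3:// source URI is a placeholder.
    df = pd.DataFrame({"feature": [1, 2, 3], "label": [0, 1, 0]})
    dataset = mlflow.data.from_pandas(df, source="s3://example-bucket/training.csv")

    with mlflow.start_run():
        # Attaches the dataset (name, schema, source) to the run; the ingestion
        # code above later reads it back via run.inputs.dataset_inputs.
        mlflow.log_input(dataset, context="training")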
@@ -659,6 +846,10 @@ class MLflowSource(StatefulIngestionSourceBase):
         )
         return wu

+    def _is_valid_platform(self, platform: Optional[str]) -> bool:
+        """Check if platform is registered as a source plugin"""
+        return platform in KNOWN_VALID_PLATFORM_NAMES
+
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
         config = MLflowConfig.parse_obj(config_dict)
datahub/ingestion/source/mode.py

@@ -33,6 +33,7 @@ from datahub.emitter.mcp_builder import (
     add_dataset_to_container,
     gen_containers,
 )
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -339,7 +340,8 @@ class ModeSource(StatefulIngestionSourceBase):

         # Test the connection
         try:
-            self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            key_info = self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            logger.debug(f"Auth info: {key_info}")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
@@ -1485,12 +1487,17 @@

         @r.wraps
         def get_request():
+            curl_command = make_curl_command(self.session, "GET", url, "")
+            logger.debug(f"Issuing request; curl equivalent: {curl_command}")
+
             try:
                 response = self.session.get(
                     url, timeout=self.config.api_options.timeout
                 )
                 if response.status_code == 204:  # No content, don't parse json
                     return {}
+
+                response.raise_for_status()
                 return response.json()
             except HTTPError as http_error:
                 error_response = http_error.response
@@ -1501,6 +1508,9 @@
                     time.sleep(float(sleep_time))
                     raise HTTPError429 from None

+                logger.debug(
+                    f"Error response ({error_response.status_code}): {error_response.text}"
+                )
                 raise http_error

         return get_request()
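
A quick sketch of the newly imported make_curl_command helper used above for request debugging, mirroring the (session, method, url, payload) call from the hunk; the session headers and URL are placeholders:

    import logging

    import requests

    from datahub.emitter.request_helper import make_curl_command

    logger = logging.getLogger(__name__)

    session = requests.Session()
    session.headers.update({"Authorization": "Basic <redacted>"})  # placeholder auth

    # Same call shape as in the Mode source above.
    curl_command = make_curl_command(session, "GET", "https://app.mode.com/api/verify", "")
    logger.debug(f"Issuing request; curl equivalent: {curl_command}")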
datahub/ingestion/source/openapi.py

@@ -2,13 +2,14 @@ import logging
 import time
 import warnings
 from abc import ABC
-from typing import Dict, Iterable, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple

 from pydantic import validator
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -20,6 +21,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.openapi_parser import (
     clean_url,
     compose_url_attr,
@@ -32,14 +34,13 @@ from datahub.ingestion.source.openapi_parser import (
     set_metadata,
     try_guessing,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
     DatasetPropertiesClass,
     GlobalTagsClass,
     InstitutionalMemoryClass,
     InstitutionalMemoryMetadataClass,
+    SubTypesClass,
     TagAssociationClass,
 )

@@ -222,8 +223,9 @@ class APISource(Source, ABC):

     def init_dataset(
         self, endpoint_k: str, endpoint_dets: dict
-    ) -> Tuple[DatasetSnapshot, str]:
+    ) -> Tuple[str, str, List[MetadataWorkUnit]]:
         config = self.config
+        workunits = []

         dataset_name = endpoint_k[1:].replace("/", ".")

@@ -233,22 +235,27 @@ class APISource(Source, ABC):
         else:
             dataset_name = "root"

-        dataset_snapshot = DatasetSnapshot(
-            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
-            aspects=[],
-        )
+        dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)"

-        # adding description
-        dataset_properties = DatasetPropertiesClass(
+        # Create dataset properties aspect
+        properties = DatasetPropertiesClass(
             description=endpoint_dets["description"], customProperties={}
         )
-        dataset_snapshot.aspects.append(dataset_properties)
+        wu = MetadataWorkUnit(
+            id=dataset_name,
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=properties),
+        )
+        workunits.append(wu)

-        # adding tags
+        # Create tags aspect
         tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
         tags_tac = [TagAssociationClass(t) for t in tags_str]
         gtc = GlobalTagsClass(tags_tac)
-        dataset_snapshot.aspects.append(gtc)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-tags",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=gtc),
+        )
+        workunits.append(wu)

         # the link will appear in the "documentation"
         link_url = clean_url(config.url + self.url_basepath + endpoint_k)
@@ -260,17 +267,25 @@ class APISource(Source, ABC):
             url=link_url, description=link_description, createStamp=creation
         )
         inst_memory = InstitutionalMemoryClass([link_metadata])
-        dataset_snapshot.aspects.append(inst_memory)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-docs",
+            mcp=MetadataChangeProposalWrapper(
+                entityUrn=dataset_urn, aspect=inst_memory
+            ),
+        )
+        workunits.append(wu)

-        return dataset_snapshot, dataset_name
+        # Create subtype aspect
+        sub_types = SubTypesClass(typeNames=[DatasetSubTypes.API_ENDPOINT])
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-subtype",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=sub_types),
+        )
+        workunits.append(wu)

-    def build_wu(
-        self, dataset_snapshot: DatasetSnapshot, dataset_name: str
-    ) -> ApiWorkUnit:
-        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        return ApiWorkUnit(id=dataset_name, mce=mce)
+        return dataset_name, dataset_urn, workunits

-    def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         config = self.config

         sw_dict = self.config.get_swagger()
@@ -294,16 +309,24 @@ class APISource(Source, ABC):
             if endpoint_k in config.ignore_endpoints:
                 continue

-            dataset_snapshot, dataset_name = self.init_dataset(
+            # Initialize dataset and get common aspects
+            dataset_name, dataset_urn, workunits = self.init_dataset(
                 endpoint_k, endpoint_dets
             )
+            for wu in workunits:
+                yield wu

-            # adding dataset fields
+            # Handle schema metadata if available
            if "data" in endpoint_dets.keys():
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
-                dataset_snapshot.aspects.append(schema_metadata)
-                yield self.build_wu(dataset_snapshot, dataset_name)
+                wu = MetadataWorkUnit(
+                    id=f"{dataset_name}-schema",
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=schema_metadata
+                    ),
+                )
+                yield wu
             elif endpoint_dets["method"] != "get":
                 self.report.report_warning(
@@ -338,9 +361,13 @@ class APISource(Source, ABC):
                             context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                         )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
             else:
@@ -369,9 +396,13 @@ class APISource(Source, ABC):
                             context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                         )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
@@ -400,9 +431,13 @@ class APISource(Source, ABC):
                             context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                         )
                     schema_metadata = set_metadata(dataset_name, fields2add)
-                    dataset_snapshot.aspects.append(schema_metadata)
-
-                    yield self.build_wu(dataset_snapshot, dataset_name)
+                    wu = MetadataWorkUnit(
+                        id=f"{dataset_name}-schema",
+                        mcp=MetadataChangeProposalWrapper(
+                            entityUrn=dataset_urn, aspect=schema_metadata
+                        ),
+                    )
+                    yield wu
                 else:
                     self.report_bad_responses(response.status_code, type=endpoint_k)
datahub/ingestion/source/powerbi/powerbi.py

@@ -666,6 +666,7 @@ class Mapper:
         workspace: powerbi_data_classes.Workspace,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi dashboard to Datahub dashboard
@@ -695,6 +696,7 @@
             lastModified=ChangeAuditStamps(),
             dashboardUrl=dashboard.webUrl,
             customProperties={**chart_custom_properties(dashboard)},
+            dashboards=dashboard_edges,
         )

         info_mcp = self.new_mcp(
@@ -933,7 +935,7 @@
         dashboard: powerbi_data_classes.Dashboard,
         workspace: powerbi_data_classes.Workspace,
     ) -> List[EquableMetadataWorkUnit]:
-        mcps = []
+        mcps: List[MetadataChangeProposalWrapper] = []

         logger.info(
             f"Converting dashboard={dashboard.displayName} to datahub dashboard"
@@ -945,9 +947,30 @@
         )
         # Convert tiles to charts
         ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace)
+
+        # collect all downstream reports (dashboards)
+        dashboard_edges = []
+        for t in dashboard.tiles:
+            if t.report:
+                dashboard_urn = builder.make_dashboard_urn(
+                    platform=self.__config.platform_name,
+                    platform_instance=self.__config.platform_instance,
+                    name=t.report.get_urn_part(),
+                )
+                edge = EdgeClass(
+                    destinationUrn=dashboard_urn,
+                )
+                dashboard_edges.append(edge)
+
         # Lets convert dashboard to datahub dashboard
         dashboard_mcps: List[MetadataChangeProposalWrapper] = (
-            self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+            self.to_datahub_dashboard_mcp(
+                dashboard=dashboard,
+                workspace=workspace,
+                chart_mcps=chart_mcps,
+                user_mcps=user_mcps,
+                dashboard_edges=dashboard_edges,
+            )
         )

         # Now add MCPs in sequence
@@ -1054,7 +1077,6 @@
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
-        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1076,7 +1098,6 @@
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
-            dashboards=dashboard_edges,
         )

         info_mcp = self.new_mcp(
@@ -1170,27 +1191,12 @@
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)

-        # find all dashboards with a Tile referencing this report
-        downstream_dashboards_edges = []
-        for d in workspace.dashboards.values():
-            if any(t.report_id == report.id for t in d.tiles):
-                dashboard_urn = builder.make_dashboard_urn(
-                    platform=self.__config.platform_name,
-                    platform_instance=self.__config.platform_instance,
-                    name=d.get_urn_part(),
-                )
-                edge = EdgeClass(
-                    destinationUrn=dashboard_urn,
-                    sourceUrn=None,
-                    created=None,
-                    lastModified=None,
-                    properties=None,
-                )
-                downstream_dashboards_edges.append(edge)
-
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
-            workspace, report, chart_mcps, user_mcps, downstream_dashboards_edges
+            workspace=workspace,
+            report=report,
+            chart_mcps=chart_mcps,
+            user_mcps=user_mcps,
         )

         # Now add MCPs in sequence
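
The PowerBI change above reverses the linkage: the dashboard's DashboardInfoClass now carries dashboards= edges pointing at the reports behind its tiles, instead of each report listing its parent dashboards. A minimal hedged sketch of building such an edge; the platform value, report name, and title are placeholders:

    import datahub.emitter.mce_builder as builder
    from datahub.metadata.schema_classes import (
        ChangeAuditStampsClass,
        DashboardInfoClass,
        EdgeClass,
    )

    # Placeholder report URN; in the source it comes from tile.report.get_urn_part().
    report_urn = builder.make_dashboard_urn(
        platform="powerbi", platform_instance=None, name="reports.11111111-2222"
    )

    dashboard_info = DashboardInfoClass(
        title="Example dashboard",
        description="",
        charts=[],
        lastModified=ChangeAuditStampsClass(),
        # Edge from the dashboard to the report that backs one of its tiles.
        dashboards=[EdgeClass(destinationUrn=report_urn)],
    )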
datahub/ingestion/source/s3/source.py

@@ -945,6 +945,17 @@ class S3Source(StatefulIngestionSourceBase):
                 for f in list_folders(
                     bucket_name, f"{folder}", self.source_config.aws_config
                 ):
+                    table_path = self.create_s3_path(bucket_name, f)
+                    table_name, _ = path_spec.extract_table_name_and_path(
+                        table_path
+                    )
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(
+                            f"Table '{table_name}' not allowed and skipping"
+                        )
+                        self.report.report_file_dropped(table_path)
+                        continue
+
                     dirs_to_process = []
                     logger.info(f"Processing folder: {f}")
                     if path_spec.traversal_method == FolderTraversalMethod.ALL:
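
The S3 change above filters folder-derived tables through path_spec.tables_filter_pattern. A hedged configuration sketch, assuming the new field accepts an AllowDenyPattern (its .allowed() call above matches that API) and is exposed under the same name on the PathSpec model; the bucket path and patterns are placeholders:

    from datahub.configuration.common import AllowDenyPattern
    from datahub.ingestion.source.data_lake_common.path_spec import PathSpec

    # Placeholder include path; {table} marks the folder treated as the table name.
    path_spec = PathSpec(
        include="s3://example-bucket/data/{table}/*.parquet",
        # Assumption: the new field takes an AllowDenyPattern, matching the
        # .allowed(table_name) check in the hunk above.
        tables_filter_pattern=AllowDenyPattern(allow=["orders.*"], deny=[".*_tmp"]),
    )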