acryl-datahub 1.0.0.1rc1__py3-none-any.whl → 1.0.0.1rc3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (78)
  1. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/METADATA +2575 -2574
  2. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/RECORD +77 -60
  3. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/emitter/request_helper.py +19 -14
  8. datahub/emitter/rest_emitter.py +2 -2
  9. datahub/ingestion/api/source.py +6 -2
  10. datahub/ingestion/api/source_helpers.py +6 -2
  11. datahub/ingestion/extractor/schema_util.py +1 -0
  12. datahub/ingestion/graph/client.py +6 -11
  13. datahub/ingestion/source/common/data_platforms.py +23 -0
  14. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  15. datahub/ingestion/source/common/subtypes.py +16 -1
  16. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  17. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  18. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  19. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  20. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  21. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  22. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  23. datahub/ingestion/source/hex/__init__.py +0 -0
  24. datahub/ingestion/source/hex/api.py +394 -0
  25. datahub/ingestion/source/hex/constants.py +3 -0
  26. datahub/ingestion/source/hex/hex.py +167 -0
  27. datahub/ingestion/source/hex/mapper.py +372 -0
  28. datahub/ingestion/source/hex/model.py +68 -0
  29. datahub/ingestion/source/iceberg/iceberg.py +62 -66
  30. datahub/ingestion/source/mlflow.py +217 -8
  31. datahub/ingestion/source/mode.py +11 -1
  32. datahub/ingestion/source/openapi.py +69 -34
  33. datahub/ingestion/source/powerbi/powerbi.py +29 -23
  34. datahub/ingestion/source/s3/source.py +11 -0
  35. datahub/ingestion/source/slack/slack.py +399 -82
  36. datahub/ingestion/source/snowflake/constants.py +1 -0
  37. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  38. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  39. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  40. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  41. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  42. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  43. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  44. datahub/ingestion/source/sql/mssql/source.py +8 -4
  45. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  46. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  47. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  48. datahub/ingestion/source/superset.py +15 -6
  49. datahub/ingestion/source/vertexai/__init__.py +0 -0
  50. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  51. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  52. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  53. datahub/metadata/_schema_classes.py +472 -1
  54. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  55. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  56. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  57. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  58. datahub/metadata/schema.avsc +309 -0
  59. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  60. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  61. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  62. datahub/metadata/schemas/Deprecation.avsc +2 -0
  63. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  64. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  65. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  66. datahub/metadata/schemas/Siblings.avsc +2 -0
  67. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  68. datahub/sdk/dataset.py +122 -0
  69. datahub/sdk/entity.py +99 -3
  70. datahub/sdk/entity_client.py +27 -3
  71. datahub/sdk/main_client.py +22 -0
  72. datahub/sdk/search_filters.py +4 -4
  73. datahub/sql_parsing/split_statements.py +5 -1
  74. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  75. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  76. datahub/ingestion/source/vertexai.py +0 -695
  77. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info/licenses}/LICENSE +0 -0
  78. {acryl_datahub-1.0.0.1rc1.dist-info → acryl_datahub-1.0.0.1rc3.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/mlflow.py

@@ -1,9 +1,11 @@
+import json
+import os
 import time
 from dataclasses import dataclass
-from typing import Any, Callable, Iterable, List, Optional, TypeVar, Union
+from typing import Any, Callable, Iterable, List, Optional, Tuple, TypeVar, Union

 from mlflow import MlflowClient
-from mlflow.entities import Experiment, Run
+from mlflow.entities import Dataset as MlflowDataset, Experiment, Run
 from mlflow.entities.model_registry import ModelVersion, RegisteredModel
 from mlflow.store.entities import PagedList
 from pydantic.fields import Field
@@ -29,6 +31,7 @@ from datahub.ingestion.api.source import (
     SourceReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.data_platforms import KNOWN_VALID_PLATFORM_NAMES
 from datahub.ingestion.source.common.subtypes import MLAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -42,6 +45,7 @@ from datahub.metadata.schema_classes import (
     AuditStampClass,
     ContainerClass,
     DataPlatformInstanceClass,
+    DataProcessInstanceInputClass,
     DataProcessInstanceOutputClass,
     DataProcessInstancePropertiesClass,
     DataProcessInstanceRunEventClass,
@@ -60,16 +64,15 @@ from datahub.metadata.schema_classes import (
     TagAssociationClass,
     TagPropertiesClass,
     TimeStampClass,
+    UpstreamClass,
+    UpstreamLineageClass,
     VersionPropertiesClass,
     VersionTagClass,
     _Aspect,
 )
-from datahub.metadata.urns import (
-    DataPlatformUrn,
-    MlModelUrn,
-    VersionSetUrn,
-)
+from datahub.metadata.urns import DataPlatformUrn, DatasetUrn, MlModelUrn, VersionSetUrn
 from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset

 T = TypeVar("T")

@@ -105,6 +108,20 @@ class MLflowConfig(StatefulIngestionConfigBase, EnvConfigMixin):
             " If neither is set, external URLs are not generated."
         ),
     )
+    materialize_dataset_inputs: Optional[bool] = Field(
+        default=False,
+        description="Whether to materialize dataset inputs for each run",
+    )
+    source_mapping_to_platform: Optional[dict] = Field(
+        default=None, description="Mapping of source type to datahub platform"
+    )
+
+    username: Optional[str] = Field(
+        default=None, description="Username for MLflow authentication"
+    )
+    password: Optional[str] = Field(
+        default=None, description="Password for MLflow authentication"
+    )


 @dataclass
@@ -152,7 +169,17 @@ class MLflowSource(StatefulIngestionSourceBase):
         self.ctx = ctx
         self.config = config
         self.report = StaleEntityRemovalSourceReport()
-        self.client = MlflowClient(
+        self.client = self._configure_client()
+
+    def _configure_client(self) -> MlflowClient:
+        if bool(self.config.username) != bool(self.config.password):
+            raise ValueError("Both username and password must be set together")
+
+        if self.config.username and self.config.password:
+            os.environ["MLFLOW_TRACKING_USERNAME"] = self.config.username
+            os.environ["MLFLOW_TRACKING_PASSWORD"] = self.config.password
+
+        return MlflowClient(
             tracking_uri=self.config.tracking_uri,
             registry_uri=self.config.registry_uri,
         )
@@ -213,6 +240,7 @@ class MLflowSource(StatefulIngestionSourceBase):
         if runs:
             for run in runs:
                 yield from self._get_run_workunits(experiment, run)
+                yield from self._get_dataset_input_workunits(run)

     def _get_experiment_custom_properties(self, experiment):
         experiment_custom_props = getattr(experiment, "tags", {}) or {}
@@ -262,6 +290,183 @@ class MLflowSource(StatefulIngestionSourceBase):
                 type="SKIPPED", nativeResultType=self.platform
             )

+    def _get_dataset_schema(
+        self, dataset: MlflowDataset
+    ) -> Optional[List[Tuple[str, str]]]:
+        try:
+            schema_dict = json.loads(dataset.schema)
+        except json.JSONDecodeError:
+            self.report.warning(
+                title="Failed to load dataset schema",
+                message="Schema metadata will be missing due to a JSON parsing error.",
+                context=f"Dataset: {dataset.name}, Schema: {dataset.schema}",
+            )
+            return None
+
+        if "mlflow_colspec" in schema_dict:
+            try:
+                return [
+                    (field["name"], field["type"])
+                    for field in schema_dict["mlflow_colspec"]
+                ]
+            except (KeyError, TypeError):
+                return None
+        # If the schema is not formatted, return None
+        return None
+
+    def _get_external_dataset_urn(self, platform: str, dataset_name: str) -> str:
+        """
+        Get the URN for an external dataset.
+        Args:
+            platform: The platform of the external dataset (e.g., 's3', 'bigquery')
+            dataset: The MLflow dataset
+        Returns:
+            str: The URN of the external dataset
+        """
+        return str(DatasetUrn(platform=platform, name=dataset_name))
+
+    def _get_dataset_input_workunits(self, run: Run) -> Iterable[MetadataWorkUnit]:
+        """
+        Generate workunits for dataset inputs in a run.
+
+        For each dataset input:
+        1. If source type is 'local' or 'code':
+            - Create a local dataset reference
+        2. Otherwise:
+            - If materialization is enabled:
+                - Create a hosted dataset and a dataset reference with upstream
+            - If materialization is not enabled:
+                - Create a dataset reference and add upstream if dataset exists
+        3. Add all dataset references as upstreams for the run
+        """
+        run_urn = DataProcessInstance(
+            id=run.info.run_id,
+            orchestrator=self.platform,
+        ).urn
+
+        dataset_reference_urns = []
+
+        for dataset_input in run.inputs.dataset_inputs:
+            dataset = dataset_input.dataset
+            source_type = dataset.source_type
+            dataset_tags = {k[1]: v[1] for k, v in dataset_input.tags}
+
+            # Prepare dataset properties
+            custom_properties = dataset_tags
+            formatted_schema = self._get_dataset_schema(dataset)
+            if formatted_schema is None:
+                custom_properties["schema"] = dataset.schema
+
+            # Handle local/code datasets
+            if source_type in ("local", "code"):
+                local_dataset = Dataset(
+                    platform=self.platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=custom_properties,
+                )
+                yield from local_dataset.as_workunits()
+                dataset_reference_urns.append(local_dataset.urn)
+                continue
+
+            # Handle hosted datasets
+            formatted_platform = self._get_dataset_platform_from_source_type(
+                source_type
+            )
+
+            # Validate platform if materialization is enabled
+            if self.config.materialize_dataset_inputs:
+                if not formatted_platform:
+                    self.report.failure(
+                        title="Unable to materialize dataset inputs",
+                        message=f"No mapping dataPlatform found for dataset input source type '{source_type}'",
+                        context=f"please add `materialize_dataset_inputs.source_mapping_to_platform` in config "
+                        f"(e.g. '{source_type}': 'snowflake')",
+                    )
+                    continue
+                # Create hosted dataset
+                hosted_dataset = Dataset(
+                    platform=formatted_platform,
+                    name=dataset.name,
+                    schema=formatted_schema,
+                    custom_properties=dataset_tags,
+                )
+                yield from hosted_dataset.as_workunits()
+
+            # Create dataset reference with upstream
+            hosted_dataset_reference = Dataset(
+                platform=self.platform,
+                name=dataset.name,
+                schema=formatted_schema,
+                custom_properties=dataset_tags,
+                upstreams=UpstreamLineageClass(
+                    upstreams=[
+                        UpstreamClass(
+                            self._get_external_dataset_urn(
+                                formatted_platform, dataset.name
+                            ),
+                            type="COPY",
+                        )
+                    ]
+                )
+                if formatted_platform
+                else None,
+            )
+            dataset_reference_urns.append(hosted_dataset_reference.urn)
+            yield from hosted_dataset_reference.as_workunits()
+
+        # Add dataset references as upstreams for the run
+        if dataset_reference_urns:
+            input_edges = [
+                EdgeClass(destinationUrn=str(dataset_ref_urn))
+                for dataset_ref_urn in dataset_reference_urns
+            ]
+            yield MetadataChangeProposalWrapper(
+                entityUrn=str(run_urn),
+                aspect=DataProcessInstanceInputClass(inputs=[], inputEdges=input_edges),
+            ).as_workunit()
+
+    def _get_dataset_platform_from_source_type(self, source_type: str) -> Optional[str]:
+        """
+        Map MLflow source type to DataHub platform.
+
+        Priority:
+        1. User-provided mapping in config
+        2. Internal mapping
+        3. Direct platform match from list of supported platforms
+        """
+        source_type = source_type.lower()
+
+        # User-provided mapping
+        platform = self._get_platform_from_user_mapping(source_type)
+        if platform:
+            return platform
+
+        # Internal mapping
+        if source_type == "gs":
+            return "gcs"
+
+        # Check direct platform match
+        if self._is_valid_platform(source_type):
+            return source_type
+
+        return None
+
+    def _get_platform_from_user_mapping(self, source_type: str) -> Optional[str]:
+        """
+        Get platform from user-provided mapping in config.
+        Returns None if mapping is invalid or platform is not supported.
+        """
+        source_mapping = self.config.source_mapping_to_platform
+        if not source_mapping:
+            return None
+
+        platform = source_mapping.get(source_type)
+        if not platform:
+            return None
+
+        return platform
+
     def _get_run_workunits(
         self, experiment: Experiment, run: Run
     ) -> Iterable[MetadataWorkUnit]:
@@ -659,6 +864,10 @@ class MLflowSource(StatefulIngestionSourceBase):
         )
         return wu

+    def _is_valid_platform(self, platform: Optional[str]) -> bool:
+        """Check if platform is registered as a source plugin"""
+        return platform in KNOWN_VALID_PLATFORM_NAMES
+
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "MLflowSource":
         config = MLflowConfig.parse_obj(config_dict)
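
The mlflow.py hunks above add optional basic auth (username/password exported as MLFLOW_TRACKING_USERNAME/MLFLOW_TRACKING_PASSWORD), per-run dataset-input lineage, and a source_type-to-platform resolution helper. Below is a minimal standalone sketch of that resolution order (user-supplied source_mapping_to_platform first, then the built-in gs-to-gcs alias, then a direct match against known platform names); the KNOWN_PLATFORMS set here is an illustrative stand-in for the package's KNOWN_VALID_PLATFORM_NAMES, not its actual contents.

# Illustrative sketch only: mirrors the priority in _get_dataset_platform_from_source_type,
# with a made-up platform set instead of KNOWN_VALID_PLATFORM_NAMES.
from typing import Dict, Optional, Set

KNOWN_PLATFORMS: Set[str] = {"s3", "gcs", "snowflake", "bigquery"}  # assumed subset


def resolve_platform(
    source_type: str,
    user_mapping: Optional[Dict[str, str]] = None,
) -> Optional[str]:
    source_type = source_type.lower()
    # 1. A user-provided mapping (source_mapping_to_platform in the recipe) wins.
    if user_mapping and user_mapping.get(source_type):
        return user_mapping[source_type]
    # 2. Built-in alias: MLflow reports GCS-backed sources with the "gs" scheme.
    if source_type == "gs":
        return "gcs"
    # 3. Fall back to a direct match against the known platform names.
    if source_type in KNOWN_PLATFORMS:
        return source_type
    return None


print(resolve_platform("gs"))  # -> gcs
print(resolve_platform("delta_table", {"delta_table": "databricks"}))  # -> databricks
print(resolve_platform("http"))  # -> None (not a known platform in this sketch)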

datahub/ingestion/source/mode.py

@@ -33,6 +33,7 @@ from datahub.emitter.mcp_builder import (
     add_dataset_to_container,
     gen_containers,
 )
+from datahub.emitter.request_helper import make_curl_command
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -339,7 +340,8 @@ class ModeSource(StatefulIngestionSourceBase):

         # Test the connection
         try:
-            self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            key_info = self._get_request_json(f"{self.config.connect_uri}/api/verify")
+            logger.debug(f"Auth info: {key_info}")
         except ModeRequestError as e:
             self.report.report_failure(
                 title="Failed to Connect",
@@ -1485,12 +1487,17 @@ class ModeSource(StatefulIngestionSourceBase):

         @r.wraps
         def get_request():
+            curl_command = make_curl_command(self.session, "GET", url, "")
+            logger.debug(f"Issuing request; curl equivalent: {curl_command}")
+
             try:
                 response = self.session.get(
                     url, timeout=self.config.api_options.timeout
                 )
                 if response.status_code == 204:  # No content, don't parse json
                     return {}
+
+                response.raise_for_status()
                 return response.json()
             except HTTPError as http_error:
                 error_response = http_error.response
@@ -1501,6 +1508,9 @@ class ModeSource(StatefulIngestionSourceBase):
                     time.sleep(float(sleep_time))
                     raise HTTPError429 from None

+                logger.debug(
+                    f"Error response ({error_response.status_code}): {error_response.text}"
+                )
                 raise http_error

         return get_request()
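
The mode.py hunks add debug logging of a curl-equivalent request (via make_curl_command) and a response.raise_for_status() call so non-2xx responses surface as HTTPError and reach the 429 backoff handling. A generic sketch of the same pattern using plain requests; make_curl_command and the source's retry decorator are replaced by stand-ins, so this is not DataHub's actual helper.

# Sketch only: approximates ModeSource._get_request_json's request/debug flow.
import logging
import time

import requests

logger = logging.getLogger(__name__)


def get_json(session: requests.Session, url: str, timeout: int = 40) -> dict:
    logger.debug(f"Issuing request; curl equivalent: curl -X GET '{url}'")
    try:
        response = session.get(url, timeout=timeout)
        if response.status_code == 204:  # No content, nothing to parse
            return {}
        response.raise_for_status()  # surface 4xx/5xx instead of failing in .json()
        return response.json()
    except requests.HTTPError as http_error:
        error_response = http_error.response
        if error_response is not None and error_response.status_code == 429:
            time.sleep(float(error_response.headers.get("retry-after", "1")))
            raise  # the real source re-raises a typed error for its retry wrapper
        if error_response is not None:
            logger.debug(
                f"Error response ({error_response.status_code}): {error_response.text}"
            )
        raise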

datahub/ingestion/source/openapi.py

@@ -2,13 +2,14 @@ import logging
 import time
 import warnings
 from abc import ABC
-from typing import Dict, Iterable, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple

 from pydantic import validator
 from pydantic.fields import Field

 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mce_builder import make_tag_urn
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -20,6 +21,7 @@ from datahub.ingestion.api.decorators import (
 )
 from datahub.ingestion.api.source import Source, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.common.subtypes import DatasetSubTypes
 from datahub.ingestion.source.openapi_parser import (
     clean_url,
     compose_url_attr,
@@ -32,14 +34,13 @@ from datahub.ingestion.source.openapi_parser import (
     set_metadata,
     try_guessing,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
     DatasetPropertiesClass,
     GlobalTagsClass,
     InstitutionalMemoryClass,
     InstitutionalMemoryMetadataClass,
+    SubTypesClass,
     TagAssociationClass,
 )

@@ -222,8 +223,9 @@ class APISource(Source, ABC):

     def init_dataset(
         self, endpoint_k: str, endpoint_dets: dict
-    ) -> Tuple[DatasetSnapshot, str]:
+    ) -> Tuple[str, str, List[MetadataWorkUnit]]:
         config = self.config
+        workunits = []

         dataset_name = endpoint_k[1:].replace("/", ".")

@@ -233,22 +235,27 @@ class APISource(Source, ABC):
         else:
             dataset_name = "root"

-        dataset_snapshot = DatasetSnapshot(
-            urn=f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)",
-            aspects=[],
-        )
+        dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{self.platform},{config.name}.{dataset_name},PROD)"

-        # adding description
-        dataset_properties = DatasetPropertiesClass(
+        # Create dataset properties aspect
+        properties = DatasetPropertiesClass(
             description=endpoint_dets["description"], customProperties={}
         )
-        dataset_snapshot.aspects.append(dataset_properties)
+        wu = MetadataWorkUnit(
+            id=dataset_name,
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=properties),
+        )
+        workunits.append(wu)

-        # adding tags
+        # Create tags aspect
         tags_str = [make_tag_urn(t) for t in endpoint_dets["tags"]]
         tags_tac = [TagAssociationClass(t) for t in tags_str]
         gtc = GlobalTagsClass(tags_tac)
-        dataset_snapshot.aspects.append(gtc)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-tags",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=gtc),
+        )
+        workunits.append(wu)

         # the link will appear in the "documentation"
         link_url = clean_url(config.url + self.url_basepath + endpoint_k)
@@ -260,17 +267,25 @@ class APISource(Source, ABC):
             url=link_url, description=link_description, createStamp=creation
         )
         inst_memory = InstitutionalMemoryClass([link_metadata])
-        dataset_snapshot.aspects.append(inst_memory)
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-docs",
+            mcp=MetadataChangeProposalWrapper(
+                entityUrn=dataset_urn, aspect=inst_memory
+            ),
+        )
+        workunits.append(wu)

-        return dataset_snapshot, dataset_name
+        # Create subtype aspect
+        sub_types = SubTypesClass(typeNames=[DatasetSubTypes.API_ENDPOINT])
+        wu = MetadataWorkUnit(
+            id=f"{dataset_name}-subtype",
+            mcp=MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=sub_types),
+        )
+        workunits.append(wu)

-    def build_wu(
-        self, dataset_snapshot: DatasetSnapshot, dataset_name: str
-    ) -> ApiWorkUnit:
-        mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-        return ApiWorkUnit(id=dataset_name, mce=mce)
+        return dataset_name, dataset_urn, workunits

-    def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         config = self.config

         sw_dict = self.config.get_swagger()
@@ -294,16 +309,24 @@ class APISource(Source, ABC):
             if endpoint_k in config.ignore_endpoints:
                 continue

-            dataset_snapshot, dataset_name = self.init_dataset(
+            # Initialize dataset and get common aspects
+            dataset_name, dataset_urn, workunits = self.init_dataset(
                 endpoint_k, endpoint_dets
             )
+            for wu in workunits:
+                yield wu

-            # adding dataset fields
+            # Handle schema metadata if available
             if "data" in endpoint_dets.keys():
                 # we are lucky! data is defined in the swagger for this endpoint
                 schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
-                dataset_snapshot.aspects.append(schema_metadata)
-                yield self.build_wu(dataset_snapshot, dataset_name)
+                wu = MetadataWorkUnit(
+                    id=f"{dataset_name}-schema",
+                    mcp=MetadataChangeProposalWrapper(
+                        entityUrn=dataset_urn, aspect=schema_metadata
+                    ),
+                )
+                yield wu
             elif endpoint_dets["method"] != "get":
                 self.report.report_warning(
                     title="Failed to Extract Endpoint Metadata",
@@ -338,9 +361,13 @@ class APISource(Source, ABC):
                        context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                    )
                schema_metadata = set_metadata(dataset_name, fields2add)
-               dataset_snapshot.aspects.append(schema_metadata)
-
-               yield self.build_wu(dataset_snapshot, dataset_name)
+               wu = MetadataWorkUnit(
+                   id=f"{dataset_name}-schema",
+                   mcp=MetadataChangeProposalWrapper(
+                       entityUrn=dataset_urn, aspect=schema_metadata
+                   ),
+               )
+               yield wu
            else:
                self.report_bad_responses(response.status_code, type=endpoint_k)
        else:
@@ -369,9 +396,13 @@ class APISource(Source, ABC):
                        context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                    )
                schema_metadata = set_metadata(dataset_name, fields2add)
-               dataset_snapshot.aspects.append(schema_metadata)
-
-               yield self.build_wu(dataset_snapshot, dataset_name)
+               wu = MetadataWorkUnit(
+                   id=f"{dataset_name}-schema",
+                   mcp=MetadataChangeProposalWrapper(
+                       entityUrn=dataset_urn, aspect=schema_metadata
+                   ),
+               )
+               yield wu
            else:
                self.report_bad_responses(response.status_code, type=endpoint_k)
        else:
@@ -400,9 +431,13 @@ class APISource(Source, ABC):
                        context=f"Endpoint Type: {endpoint_k}, Name: {dataset_name}",
                    )
                schema_metadata = set_metadata(dataset_name, fields2add)
-               dataset_snapshot.aspects.append(schema_metadata)
-
-               yield self.build_wu(dataset_snapshot, dataset_name)
+               wu = MetadataWorkUnit(
+                   id=f"{dataset_name}-schema",
+                   mcp=MetadataChangeProposalWrapper(
+                       entityUrn=dataset_urn, aspect=schema_metadata
+                   ),
+               )
+               yield wu
            else:
                self.report_bad_responses(response.status_code, type=endpoint_k)

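The openapi.py change drops the snapshot-based emission (a DatasetSnapshot wrapped in a MetadataChangeEvent) in favor of one MetadataChangeProposalWrapper work unit per aspect, and tags each endpoint dataset with an API_ENDPOINT subtype. A minimal sketch of that per-aspect pattern, using only classes already imported in the diff; the URN and work-unit id below are illustrative placeholders, not values emitted by the source.

# Sketch of the MCE -> MCP migration shown in init_dataset: each aspect becomes its own
# work unit instead of being appended to a DatasetSnapshot. Names below are examples.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import DatasetPropertiesClass

dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:api,example_api.pets,PROD)"

properties_wu = MetadataWorkUnit(
    id="example_api.pets-properties",
    mcp=MetadataChangeProposalWrapper(
        entityUrn=dataset_urn,
        aspect=DatasetPropertiesClass(description="Pets endpoint", customProperties={}),
    ),
)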

datahub/ingestion/source/powerbi/powerbi.py

@@ -666,6 +666,7 @@ class Mapper:
         workspace: powerbi_data_classes.Workspace,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
+        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi dashboard to Datahub dashboard
@@ -695,6 +696,7 @@ class Mapper:
             lastModified=ChangeAuditStamps(),
             dashboardUrl=dashboard.webUrl,
             customProperties={**chart_custom_properties(dashboard)},
+            dashboards=dashboard_edges,
         )

         info_mcp = self.new_mcp(
@@ -933,7 +935,7 @@ class Mapper:
         dashboard: powerbi_data_classes.Dashboard,
         workspace: powerbi_data_classes.Workspace,
     ) -> List[EquableMetadataWorkUnit]:
-        mcps = []
+        mcps: List[MetadataChangeProposalWrapper] = []

         logger.info(
             f"Converting dashboard={dashboard.displayName} to datahub dashboard"
@@ -945,9 +947,30 @@ class Mapper:
         )
         # Convert tiles to charts
         ds_mcps, chart_mcps = self.to_datahub_chart(dashboard.tiles, workspace)
+
+        # collect all downstream reports (dashboards)
+        dashboard_edges = []
+        for t in dashboard.tiles:
+            if t.report:
+                dashboard_urn = builder.make_dashboard_urn(
+                    platform=self.__config.platform_name,
+                    platform_instance=self.__config.platform_instance,
+                    name=t.report.get_urn_part(),
+                )
+                edge = EdgeClass(
+                    destinationUrn=dashboard_urn,
+                )
+                dashboard_edges.append(edge)
+
         # Lets convert dashboard to datahub dashboard
         dashboard_mcps: List[MetadataChangeProposalWrapper] = (
-            self.to_datahub_dashboard_mcp(dashboard, workspace, chart_mcps, user_mcps)
+            self.to_datahub_dashboard_mcp(
+                dashboard=dashboard,
+                workspace=workspace,
+                chart_mcps=chart_mcps,
+                user_mcps=user_mcps,
+                dashboard_edges=dashboard_edges,
+            )
         )

         # Now add MCPs in sequence
@@ -1054,7 +1077,6 @@ class Mapper:
         report: powerbi_data_classes.Report,
         chart_mcps: List[MetadataChangeProposalWrapper],
         user_mcps: List[MetadataChangeProposalWrapper],
-        dashboard_edges: List[EdgeClass],
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi report to Datahub dashboard
@@ -1076,7 +1098,6 @@ class Mapper:
             charts=chart_urn_list,
             lastModified=ChangeAuditStamps(),
             dashboardUrl=report.webUrl,
-            dashboards=dashboard_edges,
         )

         info_mcp = self.new_mcp(
@@ -1170,27 +1191,12 @@ class Mapper:
         ds_mcps = self.to_datahub_dataset(report.dataset, workspace)
         chart_mcps = self.pages_to_chart(report.pages, workspace, ds_mcps)

-        # find all dashboards with a Tile referencing this report
-        downstream_dashboards_edges = []
-        for d in workspace.dashboards.values():
-            if any(t.report_id == report.id for t in d.tiles):
-                dashboard_urn = builder.make_dashboard_urn(
-                    platform=self.__config.platform_name,
-                    platform_instance=self.__config.platform_instance,
-                    name=d.get_urn_part(),
-                )
-                edge = EdgeClass(
-                    destinationUrn=dashboard_urn,
-                    sourceUrn=None,
-                    created=None,
-                    lastModified=None,
-                    properties=None,
-                )
-                downstream_dashboards_edges.append(edge)
-
         # Let's convert report to datahub dashboard
         report_mcps = self.report_to_dashboard(
-            workspace, report, chart_mcps, user_mcps, downstream_dashboards_edges
+            workspace=workspace,
+            report=report,
+            chart_mcps=chart_mcps,
+            user_mcps=user_mcps,
         )

         # Now add MCPs in sequence

datahub/ingestion/source/s3/source.py

@@ -945,6 +945,17 @@ class S3Source(StatefulIngestionSourceBase):
                 for f in list_folders(
                     bucket_name, f"{folder}", self.source_config.aws_config
                 ):
+                    table_path = self.create_s3_path(bucket_name, f)
+                    table_name, _ = path_spec.extract_table_name_and_path(
+                        table_path
+                    )
+                    if not path_spec.tables_filter_pattern.allowed(table_name):
+                        logger.debug(
+                            f"Table '{table_name}' not allowed and skipping"
+                        )
+                        self.report.report_file_dropped(table_path)
+                        continue
+
                     dirs_to_process = []
                     logger.info(f"Processing folder: {f}")
                     if path_spec.traversal_method == FolderTraversalMethod.ALL:
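
The S3 hunk filters candidate table names against the path spec's tables_filter_pattern before any folder traversal, so excluded tables are dropped (and reported) early. A standalone sketch of that allow/deny idea using plain regular expressions; DataHub's actual pattern class may differ in semantics, so this is illustrative only.

# Sketch only: approximates path_spec.tables_filter_pattern.allowed(table_name)
# with regular expressions.
import re
from typing import List


def table_allowed(name: str, allow: List[str], deny: List[str]) -> bool:
    # Keep a table when it matches at least one allow pattern and no deny pattern.
    if any(re.match(pattern, name) for pattern in deny):
        return False
    return any(re.match(pattern, name) for pattern in allow)


print(table_allowed("orders_2024", allow=[".*"], deny=["tmp_.*"]))  # True
print(table_allowed("tmp_scratch", allow=[".*"], deny=["tmp_.*"]))  # False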