acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.

Files changed (65)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/METADATA +2486 -2487
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/RECORD +64 -49
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/emitter/request_helper.py +19 -14
  8. datahub/ingestion/api/source.py +6 -2
  9. datahub/ingestion/api/source_helpers.py +6 -2
  10. datahub/ingestion/extractor/schema_util.py +1 -0
  11. datahub/ingestion/source/common/data_platforms.py +23 -0
  12. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  13. datahub/ingestion/source/common/subtypes.py +15 -0
  14. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  15. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  16. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  17. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  18. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  19. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  20. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  21. datahub/ingestion/source/hex/__init__.py +0 -0
  22. datahub/ingestion/source/hex/api.py +394 -0
  23. datahub/ingestion/source/hex/constants.py +3 -0
  24. datahub/ingestion/source/hex/hex.py +167 -0
  25. datahub/ingestion/source/hex/mapper.py +372 -0
  26. datahub/ingestion/source/hex/model.py +68 -0
  27. datahub/ingestion/source/iceberg/iceberg.py +62 -66
  28. datahub/ingestion/source/mlflow.py +198 -7
  29. datahub/ingestion/source/mode.py +11 -1
  30. datahub/ingestion/source/openapi.py +69 -34
  31. datahub/ingestion/source/powerbi/powerbi.py +29 -23
  32. datahub/ingestion/source/s3/source.py +11 -0
  33. datahub/ingestion/source/slack/slack.py +399 -82
  34. datahub/ingestion/source/superset.py +138 -22
  35. datahub/ingestion/source/vertexai/__init__.py +0 -0
  36. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  37. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  38. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  39. datahub/metadata/_schema_classes.py +472 -1
  40. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  41. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  42. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  43. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  44. datahub/metadata/schema.avsc +311 -2
  45. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  46. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  47. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  48. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  49. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  50. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  51. datahub/metadata/schemas/MetadataChangeEvent.avsc +30 -0
  52. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  53. datahub/metadata/schemas/Siblings.avsc +2 -0
  54. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  55. datahub/sdk/dataset.py +122 -0
  56. datahub/sdk/entity.py +99 -3
  57. datahub/sdk/entity_client.py +27 -3
  58. datahub/sdk/main_client.py +22 -0
  59. datahub/sdk/search_filters.py +4 -4
  60. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  61. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  62. datahub/testing/mcp_diff.py +1 -18
  63. datahub/ingestion/source/vertexai.py +0 -697
  64. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info/licenses}/LICENSE +0 -0
  65. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/data_lake_common/path_spec.py
@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib
 
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -145,6 +145,11 @@ class PathSpec(ConfigModel):
         description="Include hidden folders in the traversal (folders starting with . or _",
     )
 
+    tables_filter_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
+    )
+
     def is_path_hidden(self, path: str) -> bool:
         # Split the path into directories and filename
         dirs, filename = os.path.split(path)
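
As context for reviewers: a minimal sketch of how the new field could be configured from Python. This is not part of the diff; the bucket layout and table names are hypothetical, and it assumes the documented PathSpec and AllowDenyPattern APIs (regex allow/deny lists, allow-all by default).

    from datahub.configuration.common import AllowDenyPattern
    from datahub.ingestion.source.data_lake_common.path_spec import PathSpec

    path_spec = PathSpec(
        # {table} marks the path segment used as the table name
        include="s3://my-bucket/data/{table}/*.parquet",
        tables_filter_pattern=AllowDenyPattern(
            allow=["sales.*"],  # keep only tables whose name matches this regex
            deny=[".*_tmp"],    # drop scratch tables even if allowed above
        ),
    )
    assert path_spec.tables_filter_pattern.allowed("sales_eu")
    assert not path_spec.tables_filter_pattern.allowed("inventory")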
@@ -177,6 +182,12 @@ class PathSpec(ConfigModel):
             ):
                 return False
         logger.debug(f"{path} is not excluded")
+
+        table_name, _ = self.extract_table_name_and_path(path)
+        if not self.tables_filter_pattern.allowed(table_name):
+            return False
+        logger.debug(f"{path} is passed table name check")
+
         ext = os.path.splitext(path)[1].strip(".")
 
         if not ignore_ext:
@@ -218,6 +229,15 @@ class PathSpec(ConfigModel):
                 exclude_path.rstrip("/"), flags=pathlib.GLOBSTAR
             ):
                 return False
+
+        file_name_pattern = self.include.rsplit("/", 1)[1]
+        table_name, _ = self.extract_table_name_and_path(
+            os.path.join(path, file_name_pattern)
+        )
+        if not self.tables_filter_pattern.allowed(table_name):
+            return False
+        logger.debug(f"{path} is passed table name check")
+
         return True
 
     @classmethod
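
The directory-level check mirrors the file-level one above: a directory has no file name to match, so the file-name part of `include` is joined on before extracting the table name, letting whole table directories be pruned early. A small trace of that derivation, reusing the hypothetical path spec from the earlier sketch:

    import os

    include = "s3://my-bucket/data/{table}/*.parquet"  # hypothetical
    directory = "s3://my-bucket/data/sales_eu"

    # Same derivation the new dir_allowed() code performs:
    file_name_pattern = include.rsplit("/", 1)[1]       # "*.parquet"
    probe = os.path.join(directory, file_name_pattern)  # ".../sales_eu/*.parquet"
    # extract_table_name_and_path(probe) resolves {table} to "sales_eu",
    # which passes allow=["sales.*"]; a directory resolving to "inventory"
    # would be skipped without being traversed.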
datahub/ingestion/source/dbt/dbt_common.py
@@ -4,7 +4,7 @@ from abc import abstractmethod
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import auto
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import more_itertools
 import pydantic
@@ -849,7 +849,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         test_nodes: List[DBTNode],
         extra_custom_props: Dict[str, str],
         all_nodes_map: Dict[str, DBTNode],
-    ) -> Iterable[MetadataWorkUnit]:
+    ) -> Iterable[MetadataChangeProposalWrapper]:
         for node in sorted(test_nodes, key=lambda n: n.dbt_name):
             upstreams = get_upstreams_for_test(
                 test_node=node,
@@ -902,7 +902,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             yield MetadataChangeProposalWrapper(
                 entityUrn=assertion_urn,
                 aspect=self._make_data_platform_instance_aspect(),
-            ).as_workunit()
+            )
 
             yield make_assertion_from_test(
                 custom_props,
@@ -949,7 +949,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             ),
         )
 
-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         if self.config.write_semantics == "PATCH":
             self.ctx.require_graph("Using dbt with write_semantics=PATCH")
 
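With this change the dbt source can yield raw MetadataChangeProposalWrapper objects alongside work units; the +6/-2 changes to source.py and source_helpers.py in the file list above presumably teach the framework to absorb that union. A minimal sketch of the existing auto_workunit helper performing such a conversion; the URN is hypothetical:

    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.ingestion.api.source_helpers import auto_workunit
    from datahub.metadata.schema_classes import StatusClass

    mcps = [
        MetadataChangeProposalWrapper(
            entityUrn="urn:li:dataset:(urn:li:dataPlatform:dbt,example.model,PROD)",
            aspect=StatusClass(removed=False),
        )
    ]

    # auto_workunit wraps a mixed stream of MCPWs and work units into
    # MetadataWorkUnit objects with derived ids.
    for wu in auto_workunit(mcps):
        print(wu.id)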
datahub/ingestion/source/dbt/dbt_core.py
@@ -343,6 +343,9 @@ class DBTRunResult(BaseModel):
     def timing_map(self) -> Dict[str, DBTRunTiming]:
         return {x.name: x for x in self.timing if x.name}
 
+    def has_success_status(self) -> bool:
+        return self.status in ("pass", "success")
+
 
 class DBTRunMetadata(BaseModel):
     dbt_schema_version: str
@@ -355,12 +358,7 @@ def _parse_test_result(
     dbt_metadata: DBTRunMetadata,
     run_result: DBTRunResult,
 ) -> Optional[DBTTestResult]:
-    if run_result.status == "success":
-        # This was probably a docs generate run result, so this isn't actually
-        # a test result.
-        return None
-
-    if run_result.status != "pass":
+    if not run_result.has_success_status():
         native_results = {"message": run_result.message or ""}
         if run_result.failures:
             native_results.update({"failures": str(run_result.failures)})
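
Note this is a behavior change, not just a refactor: previously a "success" status was discarded as a presumed `dbt docs generate` artifact and only "pass" counted as passing; both now flow through as successful test results. An illustrative mirror of the new predicate:

    def has_success_status(status: str) -> bool:
        # Mirrors DBTRunResult.has_success_status / DBTTestResult.has_success_status.
        return status in ("pass", "success")

    assert has_success_status("pass") and has_success_status("success")
    assert not has_success_status("error")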
datahub/ingestion/source/dbt/dbt_tests.py
@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
 
 from datahub.emitter import mce_builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     AssertionInfoClass,
     AssertionResultClass,
@@ -43,6 +42,9 @@ class DBTTestResult:
 
     native_results: Dict[str, str]
 
+    def has_success_status(self) -> bool:
+        return self.status in ("pass", "success")
+
 
 def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]:
     """
@@ -157,7 +159,7 @@ def make_assertion_from_test(
     node: "DBTNode",
     assertion_urn: str,
     upstream_urn: str,
-) -> MetadataWorkUnit:
+) -> MetadataChangeProposalWrapper:
     assert node.test_info
     qualified_test_name = node.test_info.qualified_test_name
     column_name = node.test_info.column_name
@@ -231,7 +233,7 @@ def make_assertion_from_test(
     return MetadataChangeProposalWrapper(
         entityUrn=assertion_urn,
         aspect=assertion_info,
-    ).as_workunit()
+    )
 
 
 def make_assertion_result_from_test(
@@ -240,7 +242,7 @@ def make_assertion_result_from_test(
     assertion_urn: str,
     upstream_urn: str,
     test_warnings_are_errors: bool,
-) -> MetadataWorkUnit:
+) -> MetadataChangeProposalWrapper:
     assertionResult = AssertionRunEventClass(
         timestampMillis=int(test_result.execution_time.timestamp() * 1000.0),
         assertionUrn=assertion_urn,
@@ -249,7 +251,7 @@ def make_assertion_result_from_test(
         result=AssertionResultClass(
             type=(
                 AssertionResultTypeClass.SUCCESS
-                if test_result.status == "pass"
+                if test_result.has_success_status()
                 or (not test_warnings_are_errors and test_result.status == "warn")
                 else AssertionResultTypeClass.FAILURE
             ),
@@ -261,4 +263,4 @@ def make_assertion_result_from_test(
     return MetadataChangeProposalWrapper(
         entityUrn=assertion_urn,
         aspect=assertionResult,
-    ).as_workunit()
+    )
datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py
@@ -66,7 +66,7 @@ class DremioToDataHubSourceTypeMapping:
     }
 
     @staticmethod
-    def get_datahub_source_type(dremio_source_type: str) -> str:
+    def get_datahub_platform(dremio_source_type: str) -> str:
        """
        Return the DataHub source type.
        """
datahub/ingestion/source/dremio/dremio_entities.py
@@ -294,7 +294,7 @@ class DremioContainer:
     )
 
 
-class DremioSource(DremioContainer):
+class DremioSourceContainer(DremioContainer):
     subclass: str = "Dremio Source"
     dremio_source_type: str
     root_path: Optional[str]
@@ -337,7 +337,7 @@ class DremioCatalog:
         self.dremio_api = dremio_api
         self.edition = dremio_api.edition
         self.datasets: Deque[DremioDataset] = deque()
-        self.sources: Deque[DremioSource] = deque()
+        self.sources: Deque[DremioSourceContainer] = deque()
         self.spaces: Deque[DremioSpace] = deque()
         self.folders: Deque[DremioFolder] = deque()
         self.glossary_terms: Deque[DremioGlossaryTerm] = deque()
@@ -380,12 +380,13 @@ class DremioCatalog:
             container_type = container.get("container_type")
             if container_type == DremioEntityContainerType.SOURCE:
                 self.sources.append(
-                    DremioSource(
+                    DremioSourceContainer(
                         container_name=container.get("name"),
                         location_id=container.get("id"),
                         path=[],
                         api_operations=self.dremio_api,
-                        dremio_source_type=container.get("source_type"),
+                        dremio_source_type=container.get("source_type")
+                        or "unknown",
                         root_path=container.get("root_path"),
                         database_name=container.get("database_name"),
                     )
@@ -426,7 +427,7 @@ class DremioCatalog:
         self.set_containers()
         return deque(itertools.chain(self.sources, self.spaces, self.folders))
 
-    def get_sources(self) -> Deque[DremioSource]:
+    def get_sources(self) -> Deque[DremioSourceContainer]:
         self.set_containers()
         return self.sources
 
datahub/ingestion/source/dremio/dremio_source.py
@@ -1,7 +1,6 @@
 import logging
-import re
-from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional
 
 from datahub.emitter.mce_builder import (
@@ -28,7 +27,10 @@ from datahub.ingestion.source.dremio.dremio_api import (
     DremioEdition,
 )
 from datahub.ingestion.source.dremio.dremio_aspects import DremioAspects
-from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
+from datahub.ingestion.source.dremio.dremio_config import (
+    DremioSourceConfig,
+    DremioSourceMapping,
+)
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
 )
@@ -39,6 +41,7 @@ from datahub.ingestion.source.dremio.dremio_entities import (
     DremioDatasetType,
     DremioGlossaryTerm,
     DremioQuery,
+    DremioSourceContainer,
 )
 from datahub.ingestion.source.dremio.dremio_profiling import DremioProfiler
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
@@ -65,6 +68,17 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)
 
 
+@dataclass
+class DremioSourceMapEntry:
+    platform: str
+    source_name: str
+    dremio_source_category: str
+    root_path: str = ""
+    database_name: str = ""
+    platform_instance: Optional[str] = None
+    env: Optional[str] = None
+
+
 @platform_name("Dremio")
 @config_class(DremioSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
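
DremioSourceMapEntry replaces the untyped Dict[str, Dict] entries built by the old _build_source_map (removed below). A minimal sketch of the difference in access patterns, with hypothetical values:

    # Before: nested dicts, read back via .get() with silent None fallbacks.
    legacy_entry = {"platform": "s3", "source_name": "my_lake"}
    platform = legacy_entry.get("platform")

    # After: typed entries; misspelled field names fail loudly, and the
    # defaults ("" / None) are declared once on the dataclass.
    entry = DremioSourceMapEntry(
        platform="s3",
        source_name="my_lake",
        dremio_source_category="file_object_storage",
        root_path="/lake",
    )
    platform = entry.platform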
@@ -112,7 +126,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
-        self.source_map: Dict[str, Dict] = defaultdict()
+        self.source_map: Dict[str, DremioSourceMapEntry] = dict()
 
         # Initialize API operations
         dremio_api = DremioAPIOperations(self.config, self.report)
@@ -152,111 +166,12 @@ class DremioSource(StatefulIngestionSourceBase):
     def get_platform(self) -> str:
         return "dremio"
 
-    def _build_source_map(self) -> Dict[str, Dict]:
-        """
-        Builds a source mapping dictionary to support external lineage generation across
-        multiple Dremio sources, based on provided configuration mappings.
-
-        This method operates as follows:
-
-        1. If a source mapping is present in the config:
-           - For each source in the Dremio catalog, if the mapping's `source_name` matches
-             the `dremio_source_type`, `root_path` and `database_name` are added to the mapping
-             information, along with the platform, platform instance, and environment if they exist.
-             This allows constructing the full URN for upstream lineage.
-
-        2. If a source mapping is absent in the configuration:
-           - Default mappings are created for each source name, setting `env` and `platform_instance`
-             to default values and classifying the source type. This ensures all sources have a
-             mapping, even if specific configuration details are missing.
-
-        Returns:
-            Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
-            (lowercased) and each value is another dictionary containing:
-                - `platform`: The source platform.
-                - `source_name`: The source name.
-                - `dremio_source_type`: The type mapped to DataHub,
-                  e.g., "database", "folder".
-                - Optional `root_path`, `database_name`, `platform_instance`,
-                  and `env` if provided in the configuration.
-        Example:
-            This method is used internally within the class to generate mappings before
-            creating cross-platform lineage.
-
-        """
-
-        source_map = {}
+    def _build_source_map(self) -> Dict[str, DremioSourceMapEntry]:
         dremio_sources = self.dremio_catalog.get_sources()
+        source_mappings_config = self.config.source_mappings or []
 
-        for source in dremio_sources:
-            source_name = source.container_name
-            if isinstance(source.dremio_source_type, str):
-                source_type = source.dremio_source_type.lower()
-                root_path = source.root_path.lower() if source.root_path else ""
-                database_name = (
-                    source.database_name.lower() if source.database_name else ""
-                )
-                source_present = False
-                source_platform_name = source_name
-
-                for mapping in self.config.source_mappings or []:
-                    if re.search(mapping.source_name, source_type, re.IGNORECASE):
-                        source_platform_name = mapping.source_name.lower()
-
-                    datahub_source_type = (
-                        DremioToDataHubSourceTypeMapping.get_datahub_source_type(
-                            source_type
-                        )
-                    )
-
-                    if re.search(mapping.platform, datahub_source_type, re.IGNORECASE):
-                        source_platform_name = source_platform_name.lower()
-                        source_map[source_platform_name] = {
-                            "platform": mapping.platform,
-                            "source_name": mapping.source_name,
-                            "dremio_source_type": DremioToDataHubSourceTypeMapping.get_category(
-                                source_type,
-                            ),
-                            "root_path": root_path,
-                            "database_name": database_name,
-                            "platform_instance": mapping.platform_instance,
-                            "env": mapping.env,
-                        }
-                        source_present = True
-                        break
-
-                if not source_present:
-                    try:
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-                    except Exception as exc:
-                        logger.info(
-                            f"Source {source_type} is not a standard Dremio source type. "
-                            f"Adding source_type {source_type} to mapping as database. Error: {exc}"
-                        )
-
-                        DremioToDataHubSourceTypeMapping.add_mapping(
-                            source_type, source_name
-                        )
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-
-                    source_map[source_platform_name.lower()] = {
-                        "platform": source_type,
-                        "source_name": source_name,
-                        "dremio_source_type": dremio_source_type,
-                    }
-
-            else:
-                logger.error(
-                    f'Source "{source.container_name}" is broken. Containers will not be created for source.'
-                )
-                logger.error(
-                    f'No new cross-platform lineage will be emitted for source "{source.container_name}".'
-                )
-                logger.error("Fix this source in Dremio to fix this issue.")
+        source_map = build_dremio_source_map(dremio_sources, source_mappings_config)
+        logger.info(f"Full source map: {source_map}")
 
         return source_map
 
@@ -431,6 +346,7 @@ class DremioSource(StatefulIngestionSourceBase):
             dremio_path=dataset_info.path,
             dremio_dataset=dataset_info.resource_name,
         )
+        logger.debug(f"Upstream dataset for {dataset_urn}: {upstream_urn}")
 
         if upstream_urn:
             upstream_lineage = UpstreamLineage(
@@ -596,25 +512,23 @@ class DremioSource(StatefulIngestionSourceBase):
         if not mapping:
             return None
 
-        platform = mapping.get("platform")
+        platform = mapping.platform
         if not platform:
             return None
 
-        platform_instance = mapping.get(
-            "platform_instance", self.config.platform_instance
-        )
-        env = mapping.get("env", self.config.env)
+        platform_instance = mapping.platform_instance
+        env = mapping.env or self.config.env
 
         root_path = ""
         database_name = ""
 
-        if mapping.get("dremio_source_type") == "file_object_storage":
-            if mapping.get("root_path"):
-                root_path = f"{mapping['root_path'][1:]}/"
+        if mapping.dremio_source_category == "file_object_storage":
+            if mapping.root_path:
+                root_path = f"{mapping.root_path[1:]}/"
             dremio_dataset = f"{root_path}{'/'.join(dremio_path[1:])}/{dremio_dataset}"
         else:
-            if mapping.get("database_name"):
-                database_name = f"{mapping['database_name']}."
+            if mapping.database_name:
+                database_name = f"{mapping.database_name}."
             dremio_dataset = (
                 f"{database_name}{'.'.join(dremio_path[1:])}.{dremio_dataset}"
             )
@@ -639,3 +553,68 @@ class DremioSource(StatefulIngestionSourceBase):
         Get the source report.
         """
         return self.report
+
+
+def build_dremio_source_map(
+    dremio_sources: Iterable[DremioSourceContainer],
+    source_mappings_config: List[DremioSourceMapping],
+) -> Dict[str, DremioSourceMapEntry]:
+    """
+    Builds a source mapping dictionary to support external lineage generation across
+    multiple Dremio sources, based on provided configuration mappings.
+
+    This method operates as follows:
+
+    Returns:
+        Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
+        (lowercased) and each value is another entry containing:
+            - `platform`: The source platform.
+            - `source_name`: The source name.
+            - `dremio_source_category`: The type mapped to DataHub,
+              e.g., "database", "folder".
+            - Optional `root_path`, `database_name`, `platform_instance`,
+              and `env` if provided in the configuration.
+    Example:
+        This method is used internally within the class to generate mappings before
+        creating cross-platform lineage.
+
+    """
+    source_map = {}
+    for source in dremio_sources:
+        current_source_name = source.container_name
+
+        source_type = source.dremio_source_type.lower()
+        source_category = DremioToDataHubSourceTypeMapping.get_category(source_type)
+        datahub_platform = DremioToDataHubSourceTypeMapping.get_datahub_platform(
+            source_type
+        )
+        root_path = source.root_path.lower() if source.root_path else ""
+        database_name = source.database_name.lower() if source.database_name else ""
+        source_present = False
+
+        for mapping in source_mappings_config:
+            if mapping.source_name.lower() == current_source_name.lower():
+                source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                    platform=mapping.platform,
+                    source_name=mapping.source_name,
+                    dremio_source_category=source_category,
+                    root_path=root_path,
+                    database_name=database_name,
+                    platform_instance=mapping.platform_instance,
+                    env=mapping.env,
+                )
+                source_present = True
+                break
+
+        if not source_present:
+            source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                platform=datahub_platform,
+                source_name=current_source_name,
+                dremio_source_category=source_category,
+                root_path=root_path,
+                database_name=database_name,
+                platform_instance=None,
+                env=None,
+            )
+
+    return source_map
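
Because the rewritten builder is now a module-level function, it can be exercised without a live Dremio connection. A hedged sketch: the stub below stands in for a DremioSourceContainer (constructing a real one needs API wiring), and it assumes DremioSourceMapping accepts the platform/source_name/env fields shown:

    from types import SimpleNamespace

    from datahub.ingestion.source.dremio.dremio_config import DremioSourceMapping
    from datahub.ingestion.source.dremio.dremio_source import build_dremio_source_map

    stub_source = SimpleNamespace(  # duck-typed DremioSourceContainer
        container_name="MyPostgres",
        dremio_source_type="postgres",
        root_path=None,
        database_name="analytics",
    )

    source_map = build_dremio_source_map(
        dremio_sources=[stub_source],  # type: ignore[list-item]
        source_mappings_config=[
            DremioSourceMapping(platform="postgres", source_name="MyPostgres", env="PROD")
        ],
    )

    # Keys are lowercased source names; configured platform/env win over defaults.
    entry = source_map["mypostgres"]
    print(entry.platform, entry.env)  # -> postgres PROD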