acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/METADATA +2486 -2487
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/RECORD +64 -49
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/emitter/request_helper.py +19 -14
- datahub/ingestion/api/source.py +6 -2
- datahub/ingestion/api/source_helpers.py +6 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +15 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +62 -66
- datahub/ingestion/source/mlflow.py +198 -7
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/powerbi.py +29 -23
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/superset.py +138 -22
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +311 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +30 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +22 -0
- datahub/sdk/search_filters.py +4 -4
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1rc2.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib
 
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -145,6 +145,11 @@ class PathSpec(ConfigModel):
         description="Include hidden folders in the traversal (folders starting with . or _",
     )
 
+    tables_filter_pattern: AllowDenyPattern = Field(
+        default=AllowDenyPattern.allow_all(),
+        description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
+    )
+
     def is_path_hidden(self, path: str) -> bool:
         # Split the path into directories and filename
         dirs, filename = os.path.split(path)
@@ -177,6 +182,12 @@ class PathSpec(ConfigModel):
         ):
             return False
         logger.debug(f"{path} is not excluded")
+
+        table_name, _ = self.extract_table_name_and_path(path)
+        if not self.tables_filter_pattern.allowed(table_name):
+            return False
+        logger.debug(f"{path} is passed table name check")
+
         ext = os.path.splitext(path)[1].strip(".")
 
         if not ignore_ext:
@@ -218,6 +229,15 @@ class PathSpec(ConfigModel):
                 exclude_path.rstrip("/"), flags=pathlib.GLOBSTAR
             ):
                 return False
+
+        file_name_pattern = self.include.rsplit("/", 1)[1]
+        table_name, _ = self.extract_table_name_and_path(
+            os.path.join(path, file_name_pattern)
+        )
+        if not self.tables_filter_pattern.allowed(table_name):
+            return False
+        logger.debug(f"{path} is passed table name check")
+
         return True
 
     @classmethod
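The new `tables_filter_pattern` field uses DataHub's standard `AllowDenyPattern`: a table name passes only if it matches an allow regex and no deny regex. A minimal sketch of that behavior, with hypothetical patterns and table names:

```python
# Minimal sketch of AllowDenyPattern filtering, assuming the standard
# semantics: deny regexes take precedence, then allow regexes are applied.
from datahub.configuration.common import AllowDenyPattern

pattern = AllowDenyPattern(allow=["orders.*"], deny=[".*_staging"])

assert pattern.allowed("orders_2024")         # matches an allow regex
assert not pattern.allowed("orders_staging")  # rejected by a deny regex
assert not pattern.allowed("customers")       # matches no allow regex
```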
datahub/ingestion/source/dbt/dbt_common.py

@@ -4,7 +4,7 @@ from abc import abstractmethod
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import auto
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import more_itertools
 import pydantic
@@ -849,7 +849,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
         test_nodes: List[DBTNode],
         extra_custom_props: Dict[str, str],
         all_nodes_map: Dict[str, DBTNode],
-    ) -> Iterable[
+    ) -> Iterable[MetadataChangeProposalWrapper]:
         for node in sorted(test_nodes, key=lambda n: n.dbt_name):
             upstreams = get_upstreams_for_test(
                 test_node=node,
@@ -902,7 +902,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             yield MetadataChangeProposalWrapper(
                 entityUrn=assertion_urn,
                 aspect=self._make_data_platform_instance_aspect(),
-            )
+            )
 
             yield make_assertion_from_test(
                 custom_props,
@@ -949,7 +949,9 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             ),
         )
 
-    def get_workunits_internal(
+    def get_workunits_internal(
+        self,
+    ) -> Iterable[Union[MetadataWorkUnit, MetadataChangeProposalWrapper]]:
         if self.config.write_semantics == "PATCH":
             self.ctx.require_graph("Using dbt with write_semantics=PATCH")
 
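The signature changes above rely on the ingestion framework accepting `MetadataChangeProposalWrapper` objects directly (see the `source.py` and `source_helpers.py` changes in the file list); previously, sources converted each MCP to a work unit themselves. A small sketch of that older, explicit conversion, with a placeholder URN:

```python
# Sketch: an MCP can be converted to a work unit explicitly, which is what
# sources did before the framework accepted MCPs directly.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass

mcp = MetadataChangeProposalWrapper(
    entityUrn="urn:li:dataset:(urn:li:dataPlatform:dbt,example.table,PROD)",  # placeholder
    aspect=StatusClass(removed=False),
)
workunit = mcp.as_workunit()  # explicit conversion; yielding `mcp` also works now
```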
datahub/ingestion/source/dbt/dbt_core.py

@@ -343,6 +343,9 @@ class DBTRunResult(BaseModel):
     def timing_map(self) -> Dict[str, DBTRunTiming]:
         return {x.name: x for x in self.timing if x.name}
 
+    def has_success_status(self) -> bool:
+        return self.status in ("pass", "success")
+
 
 class DBTRunMetadata(BaseModel):
     dbt_schema_version: str
@@ -355,12 +358,7 @@ def _parse_test_result(
     dbt_metadata: DBTRunMetadata,
     run_result: DBTRunResult,
 ) -> Optional[DBTTestResult]:
-    if run_result.
-        # This was probably a docs generate run result, so this isn't actually
-        # a test result.
-        return None
-
-    if run_result.status != "pass":
+    if not run_result.has_success_status():
         native_results = {"message": run_result.message or ""}
         if run_result.failures:
             native_results.update({"failures": str(run_result.failures)})
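`has_success_status` normalizes a dbt quirk: `dbt test` reports passing results with status `pass`, while `dbt run`/`dbt build` report `success` for models. A standalone mirror of the predicate, for illustration only:

```python
# Standalone mirror of DBTRunResult.has_success_status, for illustration.
def has_success_status(status: str) -> bool:
    return status in ("pass", "success")

assert has_success_status("pass")       # dbt test result
assert has_success_status("success")    # dbt run / build result
assert not has_success_status("warn")   # warnings are handled separately
assert not has_success_status("error")
```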
datahub/ingestion/source/dbt/dbt_tests.py

@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union
 
 from datahub.emitter import mce_builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     AssertionInfoClass,
     AssertionResultClass,
@@ -43,6 +42,9 @@ class DBTTestResult:
 
     native_results: Dict[str, str]
 
+    def has_success_status(self) -> bool:
+        return self.status in ("pass", "success")
+
 
 def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]:
     """
@@ -157,7 +159,7 @@ def make_assertion_from_test(
     node: "DBTNode",
     assertion_urn: str,
     upstream_urn: str,
-) ->
+) -> MetadataChangeProposalWrapper:
     assert node.test_info
     qualified_test_name = node.test_info.qualified_test_name
     column_name = node.test_info.column_name
@@ -231,7 +233,7 @@ def make_assertion_from_test(
     return MetadataChangeProposalWrapper(
         entityUrn=assertion_urn,
         aspect=assertion_info,
-    )
+    )
 
 
 def make_assertion_result_from_test(
@@ -240,7 +242,7 @@ def make_assertion_result_from_test(
     assertion_urn: str,
     upstream_urn: str,
     test_warnings_are_errors: bool,
-) ->
+) -> MetadataChangeProposalWrapper:
     assertionResult = AssertionRunEventClass(
         timestampMillis=int(test_result.execution_time.timestamp() * 1000.0),
         assertionUrn=assertion_urn,
@@ -249,7 +251,7 @@ def make_assertion_result_from_test(
         result=AssertionResultClass(
             type=(
                 AssertionResultTypeClass.SUCCESS
-                if test_result.
+                if test_result.has_success_status()
                 or (not test_warnings_are_errors and test_result.status == "warn")
                 else AssertionResultTypeClass.FAILURE
             ),
@@ -261,4 +263,4 @@ def make_assertion_result_from_test(
     return MetadataChangeProposalWrapper(
         entityUrn=assertion_urn,
        aspect=assertionResult,
-    )
+    )
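Note how `warn` interacts with `test_warnings_are_errors` in the result type above: a warning only fails the assertion when that flag is set. An illustrative mirror of the decision (not the library code itself):

```python
# Mirror of the SUCCESS/FAILURE decision in make_assertion_result_from_test.
def assertion_result_type(status: str, test_warnings_are_errors: bool) -> str:
    if status in ("pass", "success") or (
        not test_warnings_are_errors and status == "warn"
    ):
        return "SUCCESS"
    return "FAILURE"

assert assertion_result_type("warn", test_warnings_are_errors=False) == "SUCCESS"
assert assertion_result_type("warn", test_warnings_are_errors=True) == "FAILURE"
assert assertion_result_type("fail", test_warnings_are_errors=False) == "FAILURE"
```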
datahub/ingestion/source/dremio/dremio_entities.py

@@ -294,7 +294,7 @@ class DremioContainer:
     )
 
 
-class
+class DremioSourceContainer(DremioContainer):
     subclass: str = "Dremio Source"
     dremio_source_type: str
     root_path: Optional[str]
@@ -337,7 +337,7 @@ class DremioCatalog:
         self.dremio_api = dremio_api
         self.edition = dremio_api.edition
         self.datasets: Deque[DremioDataset] = deque()
-        self.sources: Deque[
+        self.sources: Deque[DremioSourceContainer] = deque()
         self.spaces: Deque[DremioSpace] = deque()
         self.folders: Deque[DremioFolder] = deque()
         self.glossary_terms: Deque[DremioGlossaryTerm] = deque()
@@ -380,12 +380,13 @@ class DremioCatalog:
             container_type = container.get("container_type")
             if container_type == DremioEntityContainerType.SOURCE:
                 self.sources.append(
-
+                    DremioSourceContainer(
                         container_name=container.get("name"),
                         location_id=container.get("id"),
                         path=[],
                         api_operations=self.dremio_api,
-                        dremio_source_type=container.get("source_type")
+                        dremio_source_type=container.get("source_type")
+                        or "unknown",
                         root_path=container.get("root_path"),
                         database_name=container.get("database_name"),
                     )
@@ -426,7 +427,7 @@ class DremioCatalog:
         self.set_containers()
         return deque(itertools.chain(self.sources, self.spaces, self.folders))
 
-    def get_sources(self) -> Deque[
+    def get_sources(self) -> Deque[DremioSourceContainer]:
         self.set_containers()
         return self.sources
 
datahub/ingestion/source/dremio/dremio_source.py

@@ -1,7 +1,6 @@
 import logging
-import re
-from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional
 
 from datahub.emitter.mce_builder import (
@@ -28,7 +27,10 @@ from datahub.ingestion.source.dremio.dremio_api import (
     DremioEdition,
 )
 from datahub.ingestion.source.dremio.dremio_aspects import DremioAspects
-from datahub.ingestion.source.dremio.dremio_config import
+from datahub.ingestion.source.dremio.dremio_config import (
+    DremioSourceConfig,
+    DremioSourceMapping,
+)
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
 )
@@ -39,6 +41,7 @@ from datahub.ingestion.source.dremio.dremio_entities import (
     DremioDatasetType,
     DremioGlossaryTerm,
     DremioQuery,
+    DremioSourceContainer,
 )
 from datahub.ingestion.source.dremio.dremio_profiling import DremioProfiler
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
@@ -65,6 +68,17 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)
 
 
+@dataclass
+class DremioSourceMapEntry:
+    platform: str
+    source_name: str
+    dremio_source_category: str
+    root_path: str = ""
+    database_name: str = ""
+    platform_instance: Optional[str] = None
+    env: Optional[str] = None
+
+
 @platform_name("Dremio")
 @config_class(DremioSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
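Replacing the previous ad hoc `Dict[str, Dict]` entries with a dataclass turns dictionary lookups like `mapping.get("env", ...)` into plain attribute access later in the file. A hypothetical construction, assuming the `DremioSourceMapEntry` dataclass above is in scope:

```python
# Hypothetical entry for a Postgres source named "pg_prod" in Dremio.
entry = DremioSourceMapEntry(
    platform="postgres",
    source_name="pg_prod",
    dremio_source_category="database",
    database_name="analytics",
    platform_instance=None,
    env="PROD",
)
assert entry.root_path == ""  # defaulted; only set for object-storage sources
```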
@@ -112,7 +126,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
-        self.source_map: Dict[str,
+        self.source_map: Dict[str, DremioSourceMapEntry] = dict()
 
         # Initialize API operations
         dremio_api = DremioAPIOperations(self.config, self.report)
@@ -152,111 +166,12 @@ class DremioSource(StatefulIngestionSourceBase):
     def get_platform(self) -> str:
         return "dremio"
 
-    def _build_source_map(self) -> Dict[str,
-        """
-        Builds a source mapping dictionary to support external lineage generation across
-        multiple Dremio sources, based on provided configuration mappings.
-
-        This method operates as follows:
-
-        1. If a source mapping is present in the config:
-            - For each source in the Dremio catalog, if the mapping's `source_name` matches
-              the `dremio_source_type`, `root_path` and `database_name` are added to the mapping
-              information, along with the platform, platform instance, and environment if they exist.
-              This allows constructing the full URN for upstream lineage.
-
-        2. If a source mapping is absent in the configuration:
-            - Default mappings are created for each source name, setting `env` and `platform_instance`
-              to default values and classifying the source type. This ensures all sources have a
-              mapping, even if specific configuration details are missing.
-
-        Returns:
-            Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
-            (lowercased) and each value is another dictionary containing:
-                - `platform`: The source platform.
-                - `source_name`: The source name.
-                - `dremio_source_type`: The type mapped to DataHub,
-                  e.g., "database", "folder".
-                - Optional `root_path`, `database_name`, `platform_instance`,
-                  and `env` if provided in the configuration.
-        Example:
-            This method is used internally within the class to generate mappings before
-            creating cross-platform lineage.
-
-        """
-
-        source_map = {}
+    def _build_source_map(self) -> Dict[str, DremioSourceMapEntry]:
         dremio_sources = self.dremio_catalog.get_sources()
+        source_mappings_config = self.config.source_mappings or []
 
-
-
-            if isinstance(source.dremio_source_type, str):
-                source_type = source.dremio_source_type.lower()
-                root_path = source.root_path.lower() if source.root_path else ""
-                database_name = (
-                    source.database_name.lower() if source.database_name else ""
-                )
-                source_present = False
-                source_platform_name = source_name
-
-                for mapping in self.config.source_mappings or []:
-                    if re.search(mapping.source_name, source_type, re.IGNORECASE):
-                        source_platform_name = mapping.source_name.lower()
-
-                    datahub_source_type = (
-                        DremioToDataHubSourceTypeMapping.get_datahub_source_type(
-                            source_type
-                        )
-                    )
-
-                    if re.search(mapping.platform, datahub_source_type, re.IGNORECASE):
-                        source_platform_name = source_platform_name.lower()
-                        source_map[source_platform_name] = {
-                            "platform": mapping.platform,
-                            "source_name": mapping.source_name,
-                            "dremio_source_type": DremioToDataHubSourceTypeMapping.get_category(
-                                source_type,
-                            ),
-                            "root_path": root_path,
-                            "database_name": database_name,
-                            "platform_instance": mapping.platform_instance,
-                            "env": mapping.env,
-                        }
-                        source_present = True
-                        break
-
-                if not source_present:
-                    try:
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-                    except Exception as exc:
-                        logger.info(
-                            f"Source {source_type} is not a standard Dremio source type. "
-                            f"Adding source_type {source_type} to mapping as database. Error: {exc}"
-                        )
-
-                        DremioToDataHubSourceTypeMapping.add_mapping(
-                            source_type, source_name
-                        )
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-
-                    source_map[source_platform_name.lower()] = {
-                        "platform": source_type,
-                        "source_name": source_name,
-                        "dremio_source_type": dremio_source_type,
-                    }
-
-            else:
-                logger.error(
-                    f'Source "{source.container_name}" is broken. Containers will not be created for source.'
-                )
-                logger.error(
-                    f'No new cross-platform lineage will be emitted for source "{source.container_name}".'
-                )
-                logger.error("Fix this source in Dremio to fix this issue.")
+        source_map = build_dremio_source_map(dremio_sources, source_mappings_config)
+        logger.info(f"Full source map: {source_map}")
 
         return source_map
 
@@ -431,6 +346,7 @@ class DremioSource(StatefulIngestionSourceBase):
                 dremio_path=dataset_info.path,
                 dremio_dataset=dataset_info.resource_name,
             )
+            logger.debug(f"Upstream dataset for {dataset_urn}: {upstream_urn}")
 
             if upstream_urn:
                 upstream_lineage = UpstreamLineage(
@@ -596,25 +512,23 @@ class DremioSource(StatefulIngestionSourceBase):
         if not mapping:
             return None
 
-        platform = mapping.
+        platform = mapping.platform
         if not platform:
             return None
 
-        platform_instance = mapping.
-
-        )
-        env = mapping.get("env", self.config.env)
+        platform_instance = mapping.platform_instance
+        env = mapping.env or self.config.env
 
         root_path = ""
         database_name = ""
 
-        if mapping.
-            if mapping.
-                root_path = f"{mapping
+        if mapping.dremio_source_category == "file_object_storage":
+            if mapping.root_path:
+                root_path = f"{mapping.root_path[1:]}/"
             dremio_dataset = f"{root_path}{'/'.join(dremio_path[1:])}/{dremio_dataset}"
         else:
-            if mapping.
-                database_name = f"{mapping
+            if mapping.database_name:
+                database_name = f"{mapping.database_name}."
             dremio_dataset = (
                 f"{database_name}{'.'.join(dremio_path[1:])}.{dremio_dataset}"
             )
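For `file_object_storage` sources, the leading slash of `root_path` is stripped and the remaining Dremio path segments are joined with slashes. A worked example with hypothetical values:

```python
# Hypothetical values: root_path "/lakehouse", Dremio path ["s3_src", "bronze"],
# dataset "orders". Mirrors the f-string logic above.
root_path = "/lakehouse"[1:] + "/"  # -> "lakehouse/"
dremio_path = ["s3_src", "bronze"]
dremio_dataset = f"{root_path}{'/'.join(dremio_path[1:])}/orders"
assert dremio_dataset == "lakehouse/bronze/orders"
```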
@@ -639,3 +553,68 @@ class DremioSource(StatefulIngestionSourceBase):
         Get the source report.
         """
         return self.report
+
+
+def build_dremio_source_map(
+    dremio_sources: Iterable[DremioSourceContainer],
+    source_mappings_config: List[DremioSourceMapping],
+) -> Dict[str, DremioSourceMapEntry]:
+    """
+    Builds a source mapping dictionary to support external lineage generation across
+    multiple Dremio sources, based on provided configuration mappings.
+
+    This method operates as follows:
+
+    Returns:
+        Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
+        (lowercased) and each value is another entry containing:
+            - `platform`: The source platform.
+            - `source_name`: The source name.
+            - `dremio_source_category`: The type mapped to DataHub,
+              e.g., "database", "folder".
+            - Optional `root_path`, `database_name`, `platform_instance`,
+              and `env` if provided in the configuration.
+    Example:
+        This method is used internally within the class to generate mappings before
+        creating cross-platform lineage.
+
+    """
+    source_map = {}
+    for source in dremio_sources:
+        current_source_name = source.container_name
+
+        source_type = source.dremio_source_type.lower()
+        source_category = DremioToDataHubSourceTypeMapping.get_category(source_type)
+        datahub_platform = DremioToDataHubSourceTypeMapping.get_datahub_platform(
+            source_type
+        )
+        root_path = source.root_path.lower() if source.root_path else ""
+        database_name = source.database_name.lower() if source.database_name else ""
+        source_present = False
+
+        for mapping in source_mappings_config:
+            if mapping.source_name.lower() == current_source_name.lower():
+                source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                    platform=mapping.platform,
+                    source_name=mapping.source_name,
+                    dremio_source_category=source_category,
+                    root_path=root_path,
+                    database_name=database_name,
+                    platform_instance=mapping.platform_instance,
+                    env=mapping.env,
+                )
+                source_present = True
+                break
+
+        if not source_present:
+            source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                platform=datahub_platform,
+                source_name=current_source_name,
+                dremio_source_category=source_category,
+                root_path=root_path,
+                database_name=database_name,
+                platform_instance=None,
+                env=None,
+            )
+
+    return source_map
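A hedged sketch of exercising the extracted function with `SimpleNamespace` stand-ins for the container and mapping objects. The attribute names come from the code above; the source and platform values are hypothetical, and this assumes `POSTGRES` is a known source type in `DremioToDataHubSourceTypeMapping`:

```python
from types import SimpleNamespace

from datahub.ingestion.source.dremio.dremio_source import build_dremio_source_map

# Stand-ins for DremioSourceContainer and DremioSourceMapping; only the
# attributes that build_dremio_source_map reads are provided.
source = SimpleNamespace(
    container_name="PG_PROD",
    dremio_source_type="POSTGRES",
    root_path=None,
    database_name="analytics",
)
mapping = SimpleNamespace(
    source_name="pg_prod", platform="postgres", platform_instance=None, env="PROD"
)

source_map = build_dremio_source_map([source], [mapping])
entry = source_map["pg_prod"]  # keys are lowercased source names
assert entry.platform == "postgres" and entry.database_name == "analytics"
```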