acryl-datahub 1.2.0.9rc1-py3-none-any.whl → 1.2.0.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/autogenerated/capability_summary.json +45 -1
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/dbt/dbt_common.py +65 -5
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +19 -2
- datahub/ingestion/source/ge_data_profiler.py +15 -2
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +76 -75
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_source.py +493 -547
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +46 -88
- datahub/ingestion/source/metabase.py +9 -2
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/s3/source.py +26 -24
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +122 -15
- datahub/ingestion/source/tableau/tableau.py +68 -14
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/metadata/_internal_schema_classes.py +728 -528
- datahub/metadata/_urns/urn_defs.py +1702 -1702
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/schema.avsc +17434 -17732
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/LogicalParent.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/_shared.py +126 -0
- datahub/sdk/chart.py +87 -30
- datahub/sdk/dashboard.py +79 -34
- datahub/sdk/entity_client.py +11 -4
- datahub/sdk/lineage_client.py +3 -3
- datahub/sdk/search_filters.py +1 -7
- datahub/sql_parsing/split_statements.py +13 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0
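The remainder of this page shows per-file hunks for a subset of the modules listed above. To reproduce a comparison like this locally, the two wheels can be downloaded and diffed directly; the sketch below assumes both files have already been fetched (for example with `pip download --no-deps acryl-datahub==1.2.0.9rc1` and `pip download --no-deps acryl-datahub==1.2.0.10`) and uses only the standard library. The module path chosen is just one of the changed files from the list above.

import difflib
import zipfile

OLD_WHL = "acryl_datahub-1.2.0.9rc1-py3-none-any.whl"
NEW_WHL = "acryl_datahub-1.2.0.10-py3-none-any.whl"
MEMBER = "datahub/ingestion/source/looker/lookml_config.py"

def read_member(whl_path: str, member: str) -> list:
    # Wheels are zip archives; read one module's source as a list of lines.
    with zipfile.ZipFile(whl_path) as whl:
        return whl.read(member).decode("utf-8").splitlines(keepends=True)

# Print a unified diff for a single changed module.
print("".join(difflib.unified_diff(
    read_member(OLD_WHL, MEMBER),
    read_member(NEW_WHL, MEMBER),
    fromfile=f"{OLD_WHL}/{MEMBER}",
    tofile=f"{NEW_WHL}/{MEMBER}",
)))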
datahub/ingestion/source/looker/lookml_config.py
CHANGED

@@ -122,7 +122,7 @@ class LookMLSourceConfig(
         description="List of regex patterns for LookML views to include in the extraction.",
     )
     parse_table_names_from_sql: bool = Field(True, description="See note below.")
-    api: Optional[LookerAPIConfig]
+    api: Optional[LookerAPIConfig] = None
     project_name: Optional[str] = Field(
         None,
         description="Required if you don't specify the `api` section. The project name within which all the model "
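The only change in this file gives the `api` field an explicit `None` default. As a minimal standalone sketch (not DataHub code), under Pydantic v2 an `Optional[...]` annotation by itself no longer implies a default, so without `= None` the field is required:

from typing import Optional
from pydantic import BaseModel, ValidationError

class WithoutDefault(BaseModel):
    api: Optional[int]          # required; None is merely an allowed value

class WithDefault(BaseModel):
    api: Optional[int] = None   # truly optional; may be omitted

try:
    WithoutDefault()
except ValidationError as err:
    print(err.errors()[0]["type"])  # "missing" under Pydantic v2

print(WithDefault().api)            # None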
datahub/ingestion/source/looker/lookml_source.py
CHANGED

@@ -4,7 +4,7 @@ import tempfile
 from collections import OrderedDict
 from dataclasses import dataclass
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union

 import lkml
 import lkml.simple
@@ -12,8 +12,7 @@ from looker_sdk.error import SDKError

 from datahub.configuration.git import GitInfo
 from datahub.emitter.mce_builder import make_schema_field_urn
-from datahub.emitter.
-from datahub.emitter.mcp_builder import gen_containers
+from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -77,7 +76,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     FineGrainedLineageDownstreamType,
@@ -85,18 +84,15 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamLineage,
     ViewProperties,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
-    BrowsePathEntryClass,
-    BrowsePathsV2Class,
-    ContainerClass,
     DatasetPropertiesClass,
     FineGrainedLineageClass,
     FineGrainedLineageUpstreamTypeClass,
-    SubTypesClass,
 )
+from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 from datahub.sql_parsing.sqlglot_lineage import ColumnRef

 VIEW_LANGUAGE_LOOKML: str = "lookml"
@@ -428,69 +424,40 @@ class LookMLSource(StatefulIngestionSourceBase):

         return dataset_props

-    def
-
-
-
-
-        subTypeEvent = MetadataChangeProposalWrapper(
-            entityUrn=view_urn,
-            aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
-        )
-        events = [subTypeEvent]
+    def _build_dataset_entities(self, looker_view: LookerView) -> Iterable[Dataset]:
+        dataset_extra_aspects: List[Union[ViewProperties, Status]] = [
+            Status(removed=False)
+        ]
         if looker_view.view_details is not None:
-
-                entityUrn=view_urn,
-                aspect=looker_view.view_details,
-            )
-            events.append(viewEvent)
-
-        project_key = gen_project_key(self.source_config, looker_view.id.project_name)
-
-        container = ContainerClass(container=project_key.as_urn())
-        events.append(
-            MetadataChangeProposalWrapper(entityUrn=view_urn, aspect=container)
-        )
-
-        events.append(
-            MetadataChangeProposalWrapper(
-                entityUrn=view_urn,
-                aspect=looker_view.id.get_browse_path_v2(self.source_config),
-            )
-        )
-
-        return events
-
-    def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
-        """
-        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
-        """
-        logger.debug(f"looker_view = {looker_view.id}")
+            dataset_extra_aspects.append(looker_view.view_details)

-        dataset_snapshot = DatasetSnapshot(
-            urn=looker_view.id.get_urn(self.source_config),
-            aspects=[],  # we append to this list later on
-        )
-        browse_paths = BrowsePaths(
-            paths=[looker_view.id.get_browse_path(self.source_config)]
-        )
-
-        dataset_snapshot.aspects.append(browse_paths)
-        dataset_snapshot.aspects.append(Status(removed=False))
-        upstream_lineage = self._get_upstream_lineage(looker_view)
-        if upstream_lineage is not None:
-            dataset_snapshot.aspects.append(upstream_lineage)
         schema_metadata = LookerUtil._get_schema(
             self.source_config.platform_name,
             looker_view.id.view_name,
             looker_view.fields,
             self.reporter,
         )
-        if schema_metadata is not None:
-            dataset_snapshot.aspects.append(schema_metadata)
-        dataset_snapshot.aspects.append(self._get_custom_properties(looker_view))

-
+        custom_properties: DatasetPropertiesClass = self._get_custom_properties(
+            looker_view
+        )
+
+        yield Dataset(
+            platform=self.source_config.platform_name,
+            name=looker_view.id.get_view_dataset_name(self.source_config),
+            display_name=looker_view.id.view_name,
+            platform_instance=self.source_config.platform_instance,
+            env=self.source_config.env,
+            subtype=DatasetSubTypes.VIEW,
+            parent_container=looker_view.id.get_view_dataset_parent_container(
+                self.source_config
+            ),
+            schema=schema_metadata,
+            custom_properties=custom_properties.customProperties,
+            external_url=custom_properties.externalUrl,
+            upstreams=self._get_upstream_lineage(looker_view),
+            extra_aspects=dataset_extra_aspects,
+        )

     def get_project_name(self, model_name: str) -> str:
         if self.source_config.project_name is not None:
@@ -554,7 +521,7 @@ class LookMLSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir:
             # Clone the base_folder if necessary.
             if not self.source_config.base_folder:
@@ -715,7 +682,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                 tmp_dir, project, project_visited, manifest_constants
             )

-    def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
+    def get_internal_workunits(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:  # noqa: C901
         assert self.source_config.base_folder
         viewfile_loader = LookerViewFileLoader(
             self.source_config.project_name,
@@ -949,7 +916,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                             maybe_looker_view.id.project_name
                             not in self.processed_projects
                         ):
-                            yield from self.
+                            yield from self.gen_project_containers(
                                 maybe_looker_view.id.project_name
                             )

@@ -957,15 +924,10 @@ class LookMLSource(StatefulIngestionSourceBase):
                             maybe_looker_view.id.project_name
                         )

-
+                        yield from self._build_dataset_entities(
                             maybe_looker_view
-                        ):
-                            yield mcp.as_workunit()
-                        mce = self._build_dataset_mce(maybe_looker_view)
-                        yield MetadataWorkUnit(
-                            id=f"lookml-view-{maybe_looker_view.id}",
-                            mce=mce,
                         )
+
                         processed_view_files.add(include.include)
                     else:
                         (
@@ -994,28 +956,24 @@ class LookMLSource(StatefulIngestionSourceBase):
             self.source_config.tag_measures_and_dimensions
             and self.reporter.events_produced != 0
         ):
-            # Emit tag MCEs for measures and dimensions:
+            # Emit tag MCEs for measures and dimensions if we produced any explores:
             for tag_mce in LookerUtil.get_tag_mces():
-
-
-
+                # Convert MCE to MCPs
+                for mcp in mcps_from_mce(tag_mce):
+                    yield mcp.as_workunit()

-    def
+    def gen_project_containers(self, project_name: str) -> Iterable[Container]:
         project_key = gen_project_key(
             self.source_config,
             project_name,
         )
-
+
+        yield Container(
             container_key=project_key,
-
-
+            display_name=project_name,
+            subtype=BIContainerSubTypes.LOOKML_PROJECT,
+            parent_container=["Folders"],
         )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=project_key.as_urn(),
-            aspect=BrowsePathsV2Class(
-                path=[BrowsePathEntryClass("Folders")],
-            ),
-        ).as_workunit()

     def report_skipped_unreachable_views(
         self,
datahub/ingestion/source/metabase.py
CHANGED

@@ -13,7 +13,10 @@ from pydantic import Field, root_validator, validator
 from requests.models import HTTPError

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.source_common import
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+    LowerCaseDatasetUrnConfigMixin,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -61,7 +64,11 @@ logger = logging.getLogger(__name__)
 DATASOURCE_URN_RECURSION_LIMIT = 5


-class MetabaseConfig(
+class MetabaseConfig(
+    DatasetLineageProviderConfigBase,
+    StatefulIngestionConfigBase,
+    LowerCaseDatasetUrnConfigMixin,
+):
     # See the Metabase /api/session endpoint for details
     # https://www.metabase.com/docs/latest/api-documentation.html#post-apisession
     connect_uri: str = Field(default="localhost:3000", description="Metabase host URL.")
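The rewritten class definition above spells out the config mixins MetabaseConfig inherits from. As a minimal standalone sketch (hypothetical mixin names, not the real DataHub classes), Pydantic models compose by multiple inheritance, so a source config picks up the fields declared on every base it lists:

from pydantic import BaseModel

class LowerCaseUrnMixinSketch(BaseModel):
    convert_urns_to_lowercase: bool = False

class LineageProviderMixinSketch(BaseModel):
    include_lineage: bool = True

class SourceConfigSketch(LineageProviderMixinSketch, LowerCaseUrnMixinSketch):
    connect_uri: str = "localhost:3000"

print(sorted(SourceConfigSketch.model_fields))
# ['connect_uri', 'convert_urns_to_lowercase', 'include_lineage']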
datahub/ingestion/source/metadata/business_glossary.py
CHANGED

@@ -5,11 +5,11 @@ import time
 from dataclasses import dataclass, field
 from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union

-
+import pydantic
 from pydantic.fields import Field

 import datahub.metadata.schema_classes as models
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, LaxStr
 from datahub.configuration.config_loader import load_config_file
 from datahub.emitter.mce_builder import (
     datahub_guid,
@@ -66,7 +66,7 @@ class GlossaryTermConfig(ConfigModel):
     contains: Optional[List[str]] = None
     values: Optional[List[str]] = None
     related_terms: Optional[List[str]] = None
-    custom_properties: Optional[Dict[str,
+    custom_properties: Optional[Dict[str, LaxStr]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
     domain: Optional[str] = None

@@ -82,7 +82,7 @@ class GlossaryNodeConfig(ConfigModel):
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
-    custom_properties: Optional[Dict[str,
+    custom_properties: Optional[Dict[str, LaxStr]] = None

     # Private fields.
     _urn: str
@@ -108,12 +108,12 @@ class BusinessGlossarySourceConfig(ConfigModel):


 class BusinessGlossaryConfig(DefaultConfig):
-    version:
+    version: LaxStr
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None

-    @
-    def version_must_be_1(cls, v):
+    @pydantic.field_validator("version", mode="after")
+    def version_must_be_1(cls, v: str) -> str:
         if v != "1":
             raise ValueError("Only version 1 is supported")
         return v
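`LaxStr` is imported from `datahub.configuration.common`, but its definition is not part of this diff. The sketch below is a hypothetical stand-in showing how such an annotated string type could be built so that YAML scalars like `version: 1` coerce to `"1"` before the `version_must_be_1` check runs:

from typing import Annotated
from pydantic import BaseModel, BeforeValidator, field_validator

# Hypothetical stand-in for LaxStr: coerce ints/floats to str before validation.
LaxStrSketch = Annotated[
    str, BeforeValidator(lambda v: str(v) if isinstance(v, (int, float)) else v)
]

class GlossarySketch(BaseModel):
    version: LaxStrSketch

    @field_validator("version", mode="after")
    def version_must_be_1(cls, v: str) -> str:
        if v != "1":
            raise ValueError("Only version 1 is supported")
        return v

print(GlossarySketch(version=1).version)  # "1"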
datahub/ingestion/source/mode.py
CHANGED

@@ -7,7 +7,16 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import
+from typing import (
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)

 import dateutil.parser as dp
 import psutil
@@ -24,7 +33,7 @@ from requests.models import HTTPBasicAuth, HTTPError
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
 )
@@ -200,10 +209,9 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )

-    items_per_page: int = Field(
-
+    items_per_page: HiddenFromDocs[int] = Field(
+        DEFAULT_API_ITEMS_PER_PAGE,
         description="Number of items per page for paginated API requests.",
-        hidden_from_docs=True,
     )

     @validator("connect_uri")
datahub/ingestion/source/nifi.py
CHANGED

@@ -166,7 +166,7 @@ class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     )

     @root_validator(skip_on_failure=True)
-    def validate_auth_params(
+    def validate_auth_params(cls, values):
         if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get(
             "client_cert_file"
         ):
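The hunk above touches a cross-field root validator. As a standalone sketch (hypothetical model, not the NiFi connector itself), the same pattern makes one field mandatory only when another field selects a particular mode:

from enum import Enum
from typing import Optional
from pydantic import BaseModel, root_validator

class AuthTypeSketch(str, Enum):
    NO_AUTH = "NO_AUTH"
    CLIENT_CERT = "CLIENT_CERT"

class AuthConfigSketch(BaseModel):
    auth: AuthTypeSketch = AuthTypeSketch.NO_AUTH
    client_cert_file: Optional[str] = None

    @root_validator(skip_on_failure=True)
    def validate_auth_params(cls, values):
        # client_cert_file becomes required only for certificate-based auth.
        if values.get("auth") is AuthTypeSketch.CLIENT_CERT and not values.get("client_cert_file"):
            raise ValueError("client_cert_file is required when auth is CLIENT_CERT")
        return values

AuthConfigSketch(auth="CLIENT_CERT", client_cert_file="/tmp/cert.pem")  # passes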
datahub/ingestion/source/powerbi/config.py
CHANGED

@@ -4,11 +4,10 @@ from enum import Enum
 from typing import Dict, List, Literal, Optional, Union

 import pydantic
-from pydantic import validator
-from pydantic.class_validators import root_validator
+from pydantic import root_validator, validator

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.api.incremental_lineage_helper import (
@@ -291,22 +290,18 @@ class PowerBiProfilingConfig(ConfigModel):
 class PowerBiDashboardSourceConfig(
     StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
-    platform_name: str = pydantic.Field(
-        default=Constant.PLATFORM_NAME, hidden_from_docs=True
-    )
+    platform_name: HiddenFromDocs[str] = pydantic.Field(default=Constant.PLATFORM_NAME)

-    platform_urn: str = pydantic.Field(
+    platform_urn: HiddenFromDocs[str] = pydantic.Field(
         default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME),
-        hidden_from_docs=True,
     )

     # Organization Identifier
     tenant_id: str = pydantic.Field(description="PowerBI tenant identifier")
     # PowerBi workspace identifier
-    workspace_id: Optional[str] = pydantic.Field(
+    workspace_id: HiddenFromDocs[Optional[str]] = pydantic.Field(
         default=None,
         description="[deprecated] Use workspace_id_pattern instead",
-        hidden_from_docs=True,
     )
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
@@ -326,15 +321,14 @@ class PowerBiDashboardSourceConfig(
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping:
-
-
-
-
-
-
-
-    )
+    dataset_type_mapping: HiddenFromDocs[
+        Union[Dict[str, str], Dict[str, PlatformDetail]]
+    ] = pydantic.Field(
+        default_factory=default_for_dataset_type_mapping,
+        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+        "DataHub supported datasources."
+        "You can configured platform instance for dataset lineage. "
+        "See Quickstart Recipe for mapping",
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
@@ -541,10 +535,9 @@ class PowerBiDashboardSourceConfig(
         "Increase this value if you encounter the 'M-Query Parsing Timeout' message in the connector report.",
     )

-    metadata_api_timeout: int = pydantic.Field(
+    metadata_api_timeout: HiddenFromDocs[int] = pydantic.Field(
         default=30,
         description="timeout in seconds for Metadata Rest Api.",
-        hidden_from_docs=True,
     )

     @root_validator(skip_on_failure=True)
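Several fields above move from a `hidden_from_docs=True` keyword on `Field(...)` to a `HiddenFromDocs[...]` wrapper imported from `datahub.configuration.common`. That wrapper's definition is not part of this diff; the sketch below is a hypothetical stand-in showing how a `typing.Annotated` alias can carry such a marker so documentation tooling can find it on the parsed field:

from typing import Annotated, TypeVar
from pydantic import BaseModel, Field

T = TypeVar("T")

class _HideFromDocsMarker:
    """Marker stored in the field's Annotated metadata; doc generation can look for it."""

HiddenFromDocsSketch = Annotated[T, _HideFromDocsMarker()]

class ConfigSketch(BaseModel):
    metadata_api_timeout: HiddenFromDocsSketch[int] = Field(
        default=30, description="timeout in seconds for Metadata Rest Api."
    )

# Pydantic v2 keeps unrecognized Annotated metadata on FieldInfo.metadata.
field_info = ConfigSketch.model_fields["metadata_api_timeout"]
print(any(isinstance(m, _HideFromDocsMarker) for m in field_info.metadata))  # True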
datahub/ingestion/source/preset.py
CHANGED

@@ -2,7 +2,7 @@ import logging
 from typing import Dict, Optional

 import requests
-from pydantic
+from pydantic import root_validator, validator
 from pydantic.fields import Field

 from datahub.emitter.mce_builder import DEFAULT_ENV
datahub/ingestion/source/qlik_sense/data_classes.py
CHANGED

@@ -1,8 +1,9 @@
+from copy import deepcopy
 from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Optional, Type, Union

-from pydantic import BaseModel, Field, root_validator
+from pydantic import BaseModel, ConfigDict, Field, root_validator

 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.source.qlik_sense.config import QLIK_DATETIME_FORMAT, Constant
@@ -78,7 +79,11 @@ PERSONAL_SPACE_DICT = {
 }


-class
+class _QlikBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class Space(_QlikBaseModel):
     id: str
     name: str
     description: str
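The new shared `_QlikBaseModel` turns on `coerce_numbers_to_str`, so numeric values in Qlik API payloads can populate `str` fields. A minimal standalone sketch of that setting (assumes a recent Pydantic v2, which provides `ConfigDict` and this option):

from pydantic import BaseModel, ConfigDict

class LenientModelSketch(BaseModel):
    model_config = ConfigDict(coerce_numbers_to_str=True)

class SpaceSketch(LenientModelSketch):
    id: str
    name: str

# An integer id from a JSON payload validates into the str field.
print(SpaceSketch(id=12345, name="shared").id)  # "12345"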
@@ -89,6 +94,9 @@ class Space(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
+
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDAT], QLIK_DATETIME_FORMAT
         )
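This and the following validators now copy `values` before rewriting keys. A minimal standalone sketch of the pattern (hypothetical model; the hazard it guards against assumes Pydantic v2, where `model_validate` hands the caller's dict directly to `pre=True`/'before' validators):

from copy import deepcopy
from pydantic import BaseModel, root_validator

class NonMutatingSketch(BaseModel):
    created_at: str

    @root_validator(pre=True)
    def update_values(cls, values):
        values = deepcopy(values)  # work on a private copy, not the caller's dict
        values["created_at"] = values.pop("createdDate")
        return values

payload = {"createdDate": "2024-01-01"}
NonMutatingSketch.model_validate(payload)
print("createdDate" in payload)  # True: the caller's payload is left untouched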
@@ -98,7 +106,7 @@ class Space(BaseModel):
         return values


-class Item(
+class Item(_QlikBaseModel):
     id: str
     description: str = ""
     ownerId: str
@@ -107,7 +115,7 @@ class Item(BaseModel):
     updatedAt: datetime


-class SchemaField(
+class SchemaField(_QlikBaseModel):
     name: str
     dataType: Optional[str] = None
     primaryKey: Optional[bool] = None
@@ -115,6 +123,8 @@ class SchemaField(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.DATATYPE] = values.get(Constant.DATATYPE, {}).get(Constant.TYPE)
         return values

@@ -130,6 +140,8 @@ class QlikDataset(Item):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Update str time to datetime
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDTIME], QLIK_DATETIME_FORMAT
@@ -148,13 +160,13 @@ class QlikDataset(Item):
         return values


-class AxisProperty(
+class AxisProperty(_QlikBaseModel):
     Title: str = Field(alias="qFallbackTitle")
     Min: str = Field(alias="qMin")
     Max: str = Field(alias="qMax")


-class Chart(
+class Chart(_QlikBaseModel):
     qId: str
     visualization: str
     title: str
@@ -164,13 +176,15 @@ class Chart(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.QID] = values[Constant.QINFO][Constant.QID]
         values["qDimension"] = values[Constant.HYPERCUBE]["qDimensionInfo"]
         values["qMeasure"] = values[Constant.HYPERCUBE]["qMeasureInfo"]
         return values


-class Sheet(
+class Sheet(_QlikBaseModel):
     id: str
     title: str
     description: str
@@ -181,6 +195,8 @@ class Sheet(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDDATE], QLIK_DATETIME_FORMAT
         )
@@ -190,7 +206,7 @@ class Sheet(BaseModel):
         return values


-class QlikTable(
+class QlikTable(_QlikBaseModel):
     tableName: str
     type: BoxType = Field(alias="boxType")
     tableAlias: str
@@ -206,6 +222,8 @@ class QlikTable(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.DATACONNECTORID] = values[Constant.CONNECTIONINFO][Constant.ID]
         values[Constant.DATACONNECTORPLATFORM] = values[Constant.CONNECTIONINFO][
             Constant.SOURCECONNECTORID
@@ -223,6 +241,8 @@ class App(Item):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDDATE], QLIK_DATETIME_FORMAT
         )
|
|
@@ -447,7 +447,7 @@ class RedashSource(StatefulIngestionSourceBase):
|
|
|
447
447
|
dataset_urns = sql_parser_in_tables.in_tables
|
|
448
448
|
if sql_parser_in_tables.debug_info.table_error:
|
|
449
449
|
self.report.queries_problem_parsing.add(str(query_id))
|
|
450
|
-
self.
|
|
450
|
+
self.warn(
|
|
451
451
|
logger,
|
|
452
452
|
"sql-parsing",
|
|
453
453
|
f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
|
|
datahub/ingestion/source/redshift/config.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from copy import deepcopy
 from enum import Enum
 from typing import Any, Dict, List, Optional

@@ -6,7 +7,7 @@ from pydantic import root_validator
 from pydantic.fields import Field

 from datahub.configuration import ConfigModel
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -95,10 +96,9 @@ class RedshiftConfig(
     # Because of this behavior, it uses dramatically fewer round trips for
     # large Redshift warehouses. As an example, see this query for the columns:
     # https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745.
-    scheme: str = Field(
+    scheme: HiddenFromDocs[str] = Field(
         default="redshift+redshift_connector",
         description="",
-        hidden_from_docs=True,
     )

     _database_alias_removed = pydantic_removed_field("database_alias")
@@ -216,6 +216,9 @@ class RedshiftConfig(

     @root_validator(skip_on_failure=True)
     def connection_config_compatibility_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
+
         if (
             ("options" in values and "connect_args" in values["options"])
             and "extra_client_options" in values