acryl-datahub 1.2.0.9rc2__py3-none-any.whl → 1.2.0.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (118)
  1. {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2553 -2611
  2. {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +118 -111
  3. {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/assertion/assertion.py +1 -1
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  8. datahub/api/entities/dataset/dataset.py +9 -18
  9. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  10. datahub/api/graphql/operation.py +10 -6
  11. datahub/cli/docker_check.py +2 -2
  12. datahub/configuration/common.py +29 -1
  13. datahub/configuration/connection_resolver.py +5 -2
  14. datahub/configuration/import_resolver.py +7 -4
  15. datahub/configuration/pydantic_migration_helpers.py +0 -9
  16. datahub/configuration/source_common.py +3 -2
  17. datahub/configuration/validate_field_deprecation.py +5 -2
  18. datahub/configuration/validate_field_removal.py +5 -2
  19. datahub/configuration/validate_field_rename.py +6 -5
  20. datahub/configuration/validate_multiline_string.py +5 -2
  21. datahub/ingestion/autogenerated/capability_summary.json +45 -1
  22. datahub/ingestion/run/pipeline_config.py +2 -2
  23. datahub/ingestion/source/azure/azure_common.py +1 -1
  24. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  25. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  26. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  27. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  28. datahub/ingestion/source/datahub/config.py +8 -9
  29. datahub/ingestion/source/dbt/dbt_common.py +65 -5
  30. datahub/ingestion/source/delta_lake/config.py +1 -1
  31. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  32. datahub/ingestion/source/feast.py +8 -10
  33. datahub/ingestion/source/fivetran/config.py +1 -1
  34. datahub/ingestion/source/gcs/gcs_source.py +19 -2
  35. datahub/ingestion/source/ge_data_profiler.py +15 -2
  36. datahub/ingestion/source/ge_profiling_config.py +26 -22
  37. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  38. datahub/ingestion/source/grafana/models.py +12 -14
  39. datahub/ingestion/source/hex/hex.py +6 -1
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  41. datahub/ingestion/source/kafka_connect/common.py +2 -2
  42. datahub/ingestion/source/looker/looker_common.py +76 -75
  43. datahub/ingestion/source/looker/looker_config.py +15 -4
  44. datahub/ingestion/source/looker/looker_source.py +493 -547
  45. datahub/ingestion/source/looker/lookml_config.py +1 -1
  46. datahub/ingestion/source/looker/lookml_source.py +46 -88
  47. datahub/ingestion/source/metabase.py +9 -2
  48. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  49. datahub/ingestion/source/metadata/lineage.py +1 -1
  50. datahub/ingestion/source/mode.py +13 -5
  51. datahub/ingestion/source/nifi.py +1 -1
  52. datahub/ingestion/source/powerbi/config.py +14 -21
  53. datahub/ingestion/source/preset.py +1 -1
  54. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  55. datahub/ingestion/source/redash.py +1 -1
  56. datahub/ingestion/source/redshift/config.py +6 -3
  57. datahub/ingestion/source/redshift/query.py +23 -19
  58. datahub/ingestion/source/s3/source.py +26 -24
  59. datahub/ingestion/source/salesforce.py +13 -9
  60. datahub/ingestion/source/schema/json_schema.py +14 -14
  61. datahub/ingestion/source/sigma/data_classes.py +3 -0
  62. datahub/ingestion/source/snaplogic/__init__.py +0 -0
  63. datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
  64. datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
  65. datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
  66. datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
  67. datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
  68. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  69. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  70. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  71. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  72. datahub/ingestion/source/sql/athena.py +2 -1
  73. datahub/ingestion/source/sql/clickhouse.py +12 -7
  74. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  75. datahub/ingestion/source/sql/druid.py +2 -2
  76. datahub/ingestion/source/sql/hive.py +4 -3
  77. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  78. datahub/ingestion/source/sql/mssql/source.py +2 -2
  79. datahub/ingestion/source/sql/mysql.py +2 -2
  80. datahub/ingestion/source/sql/oracle.py +3 -3
  81. datahub/ingestion/source/sql/presto.py +2 -1
  82. datahub/ingestion/source/sql/teradata.py +4 -4
  83. datahub/ingestion/source/sql/trino.py +2 -1
  84. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  85. datahub/ingestion/source/sql/vertica.py +1 -1
  86. datahub/ingestion/source/sql_queries.py +6 -6
  87. datahub/ingestion/source/state/checkpoint.py +5 -1
  88. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  89. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  90. datahub/ingestion/source/superset.py +122 -15
  91. datahub/ingestion/source/tableau/tableau.py +68 -14
  92. datahub/ingestion/source/tableau/tableau_common.py +5 -0
  93. datahub/ingestion/source/tableau/tableau_constant.py +1 -0
  94. datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
  95. datahub/ingestion/source/unity/config.py +7 -3
  96. datahub/ingestion/source/usage/usage_common.py +3 -3
  97. datahub/ingestion/source_config/pulsar.py +3 -1
  98. datahub/ingestion/transformer/set_browse_path.py +112 -0
  99. datahub/metadata/_internal_schema_classes.py +728 -528
  100. datahub/metadata/_urns/urn_defs.py +1702 -1702
  101. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  102. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  103. datahub/metadata/schema.avsc +17434 -17732
  104. datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
  105. datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
  106. datahub/metadata/schemas/LogicalParent.avsc +2 -1
  107. datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
  108. datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
  109. datahub/sdk/_shared.py +126 -0
  110. datahub/sdk/chart.py +87 -30
  111. datahub/sdk/dashboard.py +79 -34
  112. datahub/sdk/entity_client.py +11 -4
  113. datahub/sdk/lineage_client.py +3 -3
  114. datahub/sdk/search_filters.py +1 -7
  115. datahub/sql_parsing/split_statements.py +13 -0
  116. {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
  117. {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
  118. {acryl_datahub-1.2.0.9rc2.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0
@@ -122,7 +122,7 @@ class LookMLSourceConfig(
         description="List of regex patterns for LookML views to include in the extraction.",
     )
     parse_table_names_from_sql: bool = Field(True, description="See note below.")
-    api: Optional[LookerAPIConfig]
+    api: Optional[LookerAPIConfig] = None
     project_name: Optional[str] = Field(
         None,
         description="Required if you don't specify the `api` section. The project name within which all the model "
@@ -4,7 +4,7 @@ import tempfile
 from collections import OrderedDict
 from dataclasses import dataclass
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional, Set, Tuple, Union

 import lkml
 import lkml.simple
@@ -12,8 +12,7 @@ from looker_sdk.error import SDKError

 from datahub.configuration.git import GitInfo
 from datahub.emitter.mce_builder import make_schema_field_urn
-from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.emitter.mcp_builder import gen_containers
+from datahub.emitter.mcp_builder import mcps_from_mce
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -77,7 +76,7 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import (
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.common import BrowsePaths, Status
+from datahub.metadata.com.linkedin.pegasus2avro.common import Status
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageTypeClass,
     FineGrainedLineageDownstreamType,
@@ -85,18 +84,15 @@ from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     UpstreamLineage,
     ViewProperties,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.metadata.schema_classes import (
     AuditStampClass,
-    BrowsePathEntryClass,
-    BrowsePathsV2Class,
-    ContainerClass,
     DatasetPropertiesClass,
     FineGrainedLineageClass,
     FineGrainedLineageUpstreamTypeClass,
-    SubTypesClass,
 )
+from datahub.sdk.container import Container
+from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 from datahub.sql_parsing.sqlglot_lineage import ColumnRef

 VIEW_LANGUAGE_LOOKML: str = "lookml"
@@ -428,69 +424,40 @@ class LookMLSource(StatefulIngestionSourceBase):

         return dataset_props

-    def _build_dataset_mcps(
-        self, looker_view: LookerView
-    ) -> List[MetadataChangeProposalWrapper]:
-        view_urn = looker_view.id.get_urn(self.source_config)
-
-        subTypeEvent = MetadataChangeProposalWrapper(
-            entityUrn=view_urn,
-            aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW]),
-        )
-        events = [subTypeEvent]
+    def _build_dataset_entities(self, looker_view: LookerView) -> Iterable[Dataset]:
+        dataset_extra_aspects: List[Union[ViewProperties, Status]] = [
+            Status(removed=False)
+        ]
         if looker_view.view_details is not None:
-            viewEvent = MetadataChangeProposalWrapper(
-                entityUrn=view_urn,
-                aspect=looker_view.view_details,
-            )
-            events.append(viewEvent)
-
-        project_key = gen_project_key(self.source_config, looker_view.id.project_name)
-
-        container = ContainerClass(container=project_key.as_urn())
-        events.append(
-            MetadataChangeProposalWrapper(entityUrn=view_urn, aspect=container)
-        )
-
-        events.append(
-            MetadataChangeProposalWrapper(
-                entityUrn=view_urn,
-                aspect=looker_view.id.get_browse_path_v2(self.source_config),
-            )
-        )
-
-        return events
-
-    def _build_dataset_mce(self, looker_view: LookerView) -> MetadataChangeEvent:
-        """
-        Creates MetadataChangeEvent for the dataset, creating upstream lineage links
-        """
-        logger.debug(f"looker_view = {looker_view.id}")
+            dataset_extra_aspects.append(looker_view.view_details)

-        dataset_snapshot = DatasetSnapshot(
-            urn=looker_view.id.get_urn(self.source_config),
-            aspects=[],  # we append to this list later on
-        )
-        browse_paths = BrowsePaths(
-            paths=[looker_view.id.get_browse_path(self.source_config)]
-        )
-
-        dataset_snapshot.aspects.append(browse_paths)
-        dataset_snapshot.aspects.append(Status(removed=False))
-        upstream_lineage = self._get_upstream_lineage(looker_view)
-        if upstream_lineage is not None:
-            dataset_snapshot.aspects.append(upstream_lineage)
         schema_metadata = LookerUtil._get_schema(
             self.source_config.platform_name,
             looker_view.id.view_name,
             looker_view.fields,
             self.reporter,
         )
-        if schema_metadata is not None:
-            dataset_snapshot.aspects.append(schema_metadata)
-        dataset_snapshot.aspects.append(self._get_custom_properties(looker_view))

-        return MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
+        custom_properties: DatasetPropertiesClass = self._get_custom_properties(
+            looker_view
+        )
+
+        yield Dataset(
+            platform=self.source_config.platform_name,
+            name=looker_view.id.get_view_dataset_name(self.source_config),
+            display_name=looker_view.id.view_name,
+            platform_instance=self.source_config.platform_instance,
+            env=self.source_config.env,
+            subtype=DatasetSubTypes.VIEW,
+            parent_container=looker_view.id.get_view_dataset_parent_container(
+                self.source_config
+            ),
+            schema=schema_metadata,
+            custom_properties=custom_properties.customProperties,
+            external_url=custom_properties.externalUrl,
+            upstreams=self._get_upstream_lineage(looker_view),
+            extra_aspects=dataset_extra_aspects,
+        )

     def get_project_name(self, model_name: str) -> str:
         if self.source_config.project_name is not None:
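Note: the refactor above replaces hand-assembled MCP/MCE aspects with the higher-level `datahub.sdk.dataset.Dataset` entity, which bundles subtype, parent container, schema, lineage and extra aspects into a single yielded object. A rough sketch of that pattern, restricted to parameters that appear in this hunk (the concrete values are invented):

    from datahub.metadata.com.linkedin.pegasus2avro.common import Status
    from datahub.sdk.dataset import Dataset

    # Illustrative values only; the real source derives them from the parsed LookerView.
    view_dataset = Dataset(
        platform="looker",                      # self.source_config.platform_name
        name="lkml_samples.view.my_view",       # get_view_dataset_name(...)
        display_name="my_view",
        subtype="View",                         # DatasetSubTypes.VIEW
        custom_properties={"looker.file.path": "views/my_view.view.lkml"},
        extra_aspects=[Status(removed=False)],  # aspects without a first-class argument
    )

    # get_workunits_internal() now yields such entities (Union[MetadataWorkUnit, Entity]),
    # and the ingestion framework turns them into metadata change proposals.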
@@ -554,7 +521,7 @@ class LookMLSource(StatefulIngestionSourceBase):
             ).workunit_processor,
         ]

-    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+    def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
         with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir:
             # Clone the base_folder if necessary.
             if not self.source_config.base_folder:
@@ -715,7 +682,7 @@ class LookMLSource(StatefulIngestionSourceBase):
             tmp_dir, project, project_visited, manifest_constants
         )

-    def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]:  # noqa: C901
+    def get_internal_workunits(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:  # noqa: C901
         assert self.source_config.base_folder
         viewfile_loader = LookerViewFileLoader(
             self.source_config.project_name,
@@ -949,7 +916,7 @@ class LookMLSource(StatefulIngestionSourceBase):
                     maybe_looker_view.id.project_name
                     not in self.processed_projects
                 ):
-                    yield from self.gen_project_workunits(
+                    yield from self.gen_project_containers(
                         maybe_looker_view.id.project_name
                     )

@@ -957,15 +924,10 @@ class LookMLSource(StatefulIngestionSourceBase):
                         maybe_looker_view.id.project_name
                     )

-                    for mcp in self._build_dataset_mcps(
+                    yield from self._build_dataset_entities(
                         maybe_looker_view
-                    ):
-                        yield mcp.as_workunit()
-                    mce = self._build_dataset_mce(maybe_looker_view)
-                    yield MetadataWorkUnit(
-                        id=f"lookml-view-{maybe_looker_view.id}",
-                        mce=mce,
                     )
+
                     processed_view_files.add(include.include)
                 else:
                     (
@@ -994,28 +956,24 @@ class LookMLSource(StatefulIngestionSourceBase):
             self.source_config.tag_measures_and_dimensions
             and self.reporter.events_produced != 0
         ):
-            # Emit tag MCEs for measures and dimensions:
+            # Emit tag MCEs for measures and dimensions if we produced any explores:
             for tag_mce in LookerUtil.get_tag_mces():
-                yield MetadataWorkUnit(
-                    id=f"tag-{tag_mce.proposedSnapshot.urn}", mce=tag_mce
-                )
+                # Convert MCE to MCPs
+                for mcp in mcps_from_mce(tag_mce):
+                    yield mcp.as_workunit()

-    def gen_project_workunits(self, project_name: str) -> Iterable[MetadataWorkUnit]:
+    def gen_project_containers(self, project_name: str) -> Iterable[Container]:
         project_key = gen_project_key(
             self.source_config,
             project_name,
         )
-        yield from gen_containers(
+
+        yield Container(
             container_key=project_key,
-            name=project_name,
-            sub_types=[BIContainerSubTypes.LOOKML_PROJECT],
+            display_name=project_name,
+            subtype=BIContainerSubTypes.LOOKML_PROJECT,
+            parent_container=["Folders"],
         )
-        yield MetadataChangeProposalWrapper(
-            entityUrn=project_key.as_urn(),
-            aspect=BrowsePathsV2Class(
-                path=[BrowsePathEntryClass("Folders")],
-            ),
-        ).as_workunit()

     def report_skipped_unreachable_views(
         self,
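Note: project containers follow the same pattern. `gen_containers` plus a hand-built `BrowsePathsV2Class` proposal collapse into one `datahub.sdk.container.Container`, with `parent_container=["Folders"]` expressing the browse path. An illustrative sketch; `DemoProjectKey` and its values are hypothetical stand-ins for what `gen_project_key(...)` produces:

    from datahub.emitter.mcp_builder import ContainerKey
    from datahub.sdk.container import Container


    class DemoProjectKey(ContainerKey):
        # Hypothetical key; the LookML source builds its own key via gen_project_key(...).
        project_name: str


    project_key = DemoProjectKey(platform="looker", project_name="my_project")

    project_container = Container(
        container_key=project_key,
        display_name="my_project",
        subtype="LookML Project",       # BIContainerSubTypes.LOOKML_PROJECT
        parent_container=["Folders"],   # replaces the explicit BrowsePathsV2 work unit
    )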
@@ -13,7 +13,10 @@ from pydantic import Field, root_validator, validator
 from requests.models import HTTPError

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.source_common import DatasetLineageProviderConfigBase
+from datahub.configuration.source_common import (
+    DatasetLineageProviderConfigBase,
+    LowerCaseDatasetUrnConfigMixin,
+)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -61,7 +64,11 @@ logger = logging.getLogger(__name__)
 DATASOURCE_URN_RECURSION_LIMIT = 5


-class MetabaseConfig(DatasetLineageProviderConfigBase, StatefulIngestionConfigBase):
+class MetabaseConfig(
+    DatasetLineageProviderConfigBase,
+    StatefulIngestionConfigBase,
+    LowerCaseDatasetUrnConfigMixin,
+):
     # See the Metabase /api/session endpoint for details
     # https://www.metabase.com/docs/latest/api-documentation.html#post-apisession
     connect_uri: str = Field(default="localhost:3000", description="Metabase host URL.")
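Note: `MetabaseConfig` now also inherits `LowerCaseDatasetUrnConfigMixin`. The mixin itself is not shown in this diff; in other DataHub sources this kind of mixin contributes an opt-in flag (commonly `convert_urns_to_lowercase`) that URN builders consult. A hypothetical sketch of the pattern:

    from pydantic import BaseModel, Field


    class LowercaseUrnMixinDemo(BaseModel):
        # Hypothetical stand-in for LowerCaseDatasetUrnConfigMixin.
        convert_urns_to_lowercase: bool = Field(
            default=False,
            description="Whether to convert dataset URNs to lowercase.",
        )


    class DemoSourceConfig(LowercaseUrnMixinDemo):
        connect_uri: str = "localhost:3000"


    cfg = DemoSourceConfig(convert_urns_to_lowercase=True)
    assert cfg.convert_urns_to_lowercase is True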
@@ -5,11 +5,11 @@ import time
 from dataclasses import dataclass, field
 from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union

-from pydantic import validator
+import pydantic
 from pydantic.fields import Field

 import datahub.metadata.schema_classes as models
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, LaxStr
 from datahub.configuration.config_loader import load_config_file
 from datahub.emitter.mce_builder import (
     datahub_guid,
@@ -66,7 +66,7 @@ class GlossaryTermConfig(ConfigModel):
     contains: Optional[List[str]] = None
     values: Optional[List[str]] = None
     related_terms: Optional[List[str]] = None
-    custom_properties: Optional[Dict[str, str]] = None
+    custom_properties: Optional[Dict[str, LaxStr]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
     domain: Optional[str] = None

@@ -82,7 +82,7 @@ class GlossaryNodeConfig(ConfigModel):
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
-    custom_properties: Optional[Dict[str, str]] = None
+    custom_properties: Optional[Dict[str, LaxStr]] = None

     # Private fields.
     _urn: str
@@ -108,12 +108,12 @@ class BusinessGlossarySourceConfig(ConfigModel):


 class BusinessGlossaryConfig(DefaultConfig):
-    version: str
+    version: LaxStr
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None

-    @validator("version")
-    def version_must_be_1(cls, v):
+    @pydantic.field_validator("version", mode="after")
+    def version_must_be_1(cls, v: str) -> str:
         if v != "1":
             raise ValueError("Only version 1 is supported")
         return v
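Note: switching `version` and `custom_properties` values from `str` to `LaxStr` lets YAML scalars that parse as numbers (e.g. `version: 1`) still validate, and the check moves to pydantic v2's `field_validator`. A self-contained sketch with a hypothetical lax-string type; the real `LaxStr` in `datahub.configuration.common` may be implemented differently:

    from typing import Annotated

    import pydantic


    def _coerce_scalar_to_str(value: object) -> object:
        # Accept YAML scalars such as 1 or 1.0 and turn them into strings.
        return str(value) if isinstance(value, (int, float)) else value


    LaxStrDemo = Annotated[str, pydantic.BeforeValidator(_coerce_scalar_to_str)]


    class GlossaryConfigDemo(pydantic.BaseModel):
        version: LaxStrDemo

        @pydantic.field_validator("version", mode="after")
        @classmethod
        def version_must_be_1(cls, v: str) -> str:
            if v != "1":
                raise ValueError("Only version 1 is supported")
            return v


    assert GlossaryConfigDemo(version=1).version == "1"  # int from YAML is accepted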
@@ -49,7 +49,7 @@ class EntityConfig(EnvConfigMixin):
     name: str
     type: str
     platform: str
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None

     @validator("type")
     def type_must_be_supported(cls, v: str) -> str:
@@ -7,7 +7,16 @@ from dataclasses import dataclass
 from datetime import datetime, timezone
 from functools import lru_cache
 from json import JSONDecodeError
-from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
+from typing import (
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Union,
+)

 import dateutil.parser as dp
 import psutil
@@ -24,7 +33,7 @@ from requests.models import HTTPBasicAuth, HTTPError
 from tenacity import retry_if_exception_type, stop_after_attempt, wait_exponential

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
 )
@@ -200,10 +209,9 @@ class ModeConfig(
         default=True, description="Tag measures and dimensions in the schema"
     )

-    items_per_page: int = Field(
-        default=DEFAULT_API_ITEMS_PER_PAGE,
+    items_per_page: HiddenFromDocs[int] = Field(
+        DEFAULT_API_ITEMS_PER_PAGE,
         description="Number of items per page for paginated API requests.",
-        hidden_from_docs=True,
     )

     @validator("connect_uri")
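Note: across this release, `Field(..., hidden_from_docs=True)` is replaced by a `HiddenFromDocs[...]` annotation from `datahub.configuration.common`. Its implementation is not part of this diff; a plausible shape is an `Annotated` alias carrying a marker that the docs generator can look for, roughly as in this hypothetical sketch:

    from typing import Annotated, TypeVar, get_args

    from pydantic import BaseModel, Field

    T = TypeVar("T")


    class _HiddenFromDocsMarker:
        """Hypothetical marker; the real HiddenFromDocs may work differently."""


    _HIDDEN = _HiddenFromDocsMarker()
    HiddenFromDocsDemo = Annotated[T, _HIDDEN]  # HiddenFromDocsDemo[int] -> Annotated[int, _HIDDEN]


    class DemoConfig(BaseModel):
        items_per_page: HiddenFromDocsDemo[int] = Field(
            100, description="Number of items per page for paginated API requests."
        )


    def is_hidden(annotation: object) -> bool:
        # A docs generator could skip any field whose annotation carries the marker.
        return _HIDDEN in get_args(annotation)


    assert is_hidden(DemoConfig.__annotations__["items_per_page"])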
@@ -166,7 +166,7 @@ class NifiSourceConfig(StatefulIngestionConfigBase, EnvConfigMixin):
     )

     @root_validator(skip_on_failure=True)
-    def validate_auth_params(cla, values):
+    def validate_auth_params(cls, values):
         if values.get("auth") is NifiAuthType.CLIENT_CERT and not values.get(
             "client_cert_file"
         ):
@@ -4,11 +4,10 @@ from enum import Enum
 from typing import Dict, List, Literal, Optional, Union

 import pydantic
-from pydantic import validator
-from pydantic.class_validators import root_validator
+from pydantic import root_validator, validator

 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import DatasetSourceConfigMixin, PlatformDetail
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.api.incremental_lineage_helper import (
@@ -291,22 +290,18 @@ class PowerBiProfilingConfig(ConfigModel):
 class PowerBiDashboardSourceConfig(
     StatefulIngestionConfigBase, DatasetSourceConfigMixin, IncrementalLineageConfigMixin
 ):
-    platform_name: str = pydantic.Field(
-        default=Constant.PLATFORM_NAME, hidden_from_docs=True
-    )
+    platform_name: HiddenFromDocs[str] = pydantic.Field(default=Constant.PLATFORM_NAME)

-    platform_urn: str = pydantic.Field(
+    platform_urn: HiddenFromDocs[str] = pydantic.Field(
         default=builder.make_data_platform_urn(platform=Constant.PLATFORM_NAME),
-        hidden_from_docs=True,
     )

     # Organization Identifier
     tenant_id: str = pydantic.Field(description="PowerBI tenant identifier")
     # PowerBi workspace identifier
-    workspace_id: Optional[str] = pydantic.Field(
+    workspace_id: HiddenFromDocs[Optional[str]] = pydantic.Field(
         default=None,
         description="[deprecated] Use workspace_id_pattern instead",
-        hidden_from_docs=True,
     )
     # PowerBi workspace identifier
     workspace_id_pattern: AllowDenyPattern = pydantic.Field(
@@ -326,15 +321,14 @@ class PowerBiDashboardSourceConfig(
     # Dataset type mapping PowerBI support many type of data-sources. Here user needs to define what type of PowerBI
     # DataSource needs to be mapped to corresponding DataHub Platform DataSource. For example, PowerBI `Snowflake` is
     # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on.
-    dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = (
-        pydantic.Field(
-            default_factory=default_for_dataset_type_mapping,
-            description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
-            "DataHub supported datasources."
-            "You can configured platform instance for dataset lineage. "
-            "See Quickstart Recipe for mapping",
-            hidden_from_docs=True,
-        )
+    dataset_type_mapping: HiddenFromDocs[
+        Union[Dict[str, str], Dict[str, PlatformDetail]]
+    ] = pydantic.Field(
+        default_factory=default_for_dataset_type_mapping,
+        description="[deprecated] Use server_to_platform_instance instead. Mapping of PowerBI datasource type to "
+        "DataHub supported datasources."
+        "You can configured platform instance for dataset lineage. "
+        "See Quickstart Recipe for mapping",
     )
     # PowerBI datasource's server to platform instance mapping
     server_to_platform_instance: Dict[
@@ -541,10 +535,9 @@ class PowerBiDashboardSourceConfig(
         "Increase this value if you encounter the 'M-Query Parsing Timeout' message in the connector report.",
     )

-    metadata_api_timeout: int = pydantic.Field(
+    metadata_api_timeout: HiddenFromDocs[int] = pydantic.Field(
         default=30,
         description="timeout in seconds for Metadata Rest Api.",
-        hidden_from_docs=True,
     )

     @root_validator(skip_on_failure=True)
@@ -2,7 +2,7 @@ import logging
 from typing import Dict, Optional

 import requests
-from pydantic.class_validators import root_validator, validator
+from pydantic import root_validator, validator
 from pydantic.fields import Field

 from datahub.emitter.mce_builder import DEFAULT_ENV
@@ -1,8 +1,9 @@
+from copy import deepcopy
 from datetime import datetime
 from enum import Enum
 from typing import Dict, List, Optional, Type, Union

-from pydantic import BaseModel, Field, root_validator
+from pydantic import BaseModel, ConfigDict, Field, root_validator

 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.source.qlik_sense.config import QLIK_DATETIME_FORMAT, Constant
@@ -78,7 +79,11 @@ PERSONAL_SPACE_DICT = {
 }


-class Space(BaseModel):
+class _QlikBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class Space(_QlikBaseModel):
     id: str
     name: str
     description: str
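Note: the new `_QlikBaseModel` turns on pydantic v2's `coerce_numbers_to_str`, so numeric IDs returned by the Qlik API still validate against `str` fields (pydantic v2 dropped v1's implicit number-to-string coercion). A minimal standalone illustration:

    from pydantic import BaseModel, ConfigDict, ValidationError


    class StrictSpace(BaseModel):
        id: str


    class LaxSpace(BaseModel):
        model_config = ConfigDict(coerce_numbers_to_str=True)

        id: str


    try:
        StrictSpace(id=12345)           # pydantic v2 rejects int -> str by default
    except ValidationError:
        pass

    assert LaxSpace(id=12345).id == "12345"  # coerced to a string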
@@ -89,6 +94,9 @@ class Space(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
+
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDAT], QLIK_DATETIME_FORMAT
         )
@@ -98,7 +106,7 @@ class Space(BaseModel):
         return values


-class Item(BaseModel):
+class Item(_QlikBaseModel):
     id: str
     description: str = ""
     ownerId: str
@@ -107,7 +115,7 @@ class Item(BaseModel):
     updatedAt: datetime


-class SchemaField(BaseModel):
+class SchemaField(_QlikBaseModel):
     name: str
     dataType: Optional[str] = None
     primaryKey: Optional[bool] = None
@@ -115,6 +123,8 @@ class SchemaField(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.DATATYPE] = values.get(Constant.DATATYPE, {}).get(Constant.TYPE)
         return values

@@ -130,6 +140,8 @@ class QlikDataset(Item):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Update str time to datetime
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDTIME], QLIK_DATETIME_FORMAT
@@ -148,13 +160,13 @@ class QlikDataset(Item):
         return values


-class AxisProperty(BaseModel):
+class AxisProperty(_QlikBaseModel):
     Title: str = Field(alias="qFallbackTitle")
     Min: str = Field(alias="qMin")
     Max: str = Field(alias="qMax")


-class Chart(BaseModel):
+class Chart(_QlikBaseModel):
     qId: str
     visualization: str
     title: str
@@ -164,13 +176,15 @@ class Chart(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.QID] = values[Constant.QINFO][Constant.QID]
         values["qDimension"] = values[Constant.HYPERCUBE]["qDimensionInfo"]
         values["qMeasure"] = values[Constant.HYPERCUBE]["qMeasureInfo"]
         return values


-class Sheet(BaseModel):
+class Sheet(_QlikBaseModel):
     id: str
     title: str
     description: str
@@ -181,6 +195,8 @@ class Sheet(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDDATE], QLIK_DATETIME_FORMAT
         )
@@ -190,7 +206,7 @@ class Sheet(BaseModel):
         return values


-class QlikTable(BaseModel):
+class QlikTable(_QlikBaseModel):
     tableName: str
     type: BoxType = Field(alias="boxType")
     tableAlias: str
@@ -206,6 +222,8 @@ class QlikTable(BaseModel):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.DATACONNECTORID] = values[Constant.CONNECTIONINFO][Constant.ID]
         values[Constant.DATACONNECTORPLATFORM] = values[Constant.CONNECTIONINFO][
             Constant.SOURCECONNECTORID
@@ -223,6 +241,8 @@ class App(Item):

     @root_validator(pre=True)
     def update_values(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         values[Constant.CREATEDAT] = datetime.strptime(
             values[Constant.CREATEDDATE], QLIK_DATETIME_FORMAT
         )
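Note: the recurring `values = deepcopy(values)` lines give each `pre=True` root validator a private copy of its input. These validators rewrite keys in place (e.g. `Constant.CREATEDAT`), so without the copy the dict supplied by the caller, often a shared fixture in tests, gets mutated as a side effect. A standalone demonstration of the difference, written with pydantic v2's equivalent `model_validator(mode="before")`:

    from copy import deepcopy

    from pydantic import BaseModel, model_validator


    class MutatingSpace(BaseModel):
        createdAt: str

        @model_validator(mode="before")
        @classmethod
        def normalize(cls, values: dict) -> dict:
            values["createdAt"] = values["createdAt"].upper()  # edits the caller's dict
            return values


    class CopyingSpace(BaseModel):
        createdAt: str

        @model_validator(mode="before")
        @classmethod
        def normalize(cls, values: dict) -> dict:
            values = deepcopy(values)  # private copy, as added in this release
            values["createdAt"] = values["createdAt"].upper()
            return values


    payload = {"createdAt": "2024-01-01t00:00:00"}
    MutatingSpace.model_validate(payload)
    assert payload["createdAt"] == "2024-01-01T00:00:00"  # input dict was changed

    payload = {"createdAt": "2024-01-01t00:00:00"}
    CopyingSpace.model_validate(payload)
    assert payload["createdAt"] == "2024-01-01t00:00:00"  # input dict left untouched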
@@ -447,7 +447,7 @@ class RedashSource(StatefulIngestionSourceBase):
         dataset_urns = sql_parser_in_tables.in_tables
         if sql_parser_in_tables.debug_info.table_error:
             self.report.queries_problem_parsing.add(str(query_id))
-            self.error(
+            self.warn(
                 logger,
                 "sql-parsing",
                 f"exception {sql_parser_in_tables.debug_info.table_error} in parsing query-{query_id}-datasource-{data_source_id}",
@@ -1,4 +1,5 @@
 import logging
+from copy import deepcopy
 from enum import Enum
 from typing import Any, Dict, List, Optional

@@ -6,7 +7,7 @@ from pydantic import root_validator
 from pydantic.fields import Field

 from datahub.configuration import ConfigModel
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
 from datahub.configuration.validate_field_removal import pydantic_removed_field
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -95,10 +96,9 @@ class RedshiftConfig(
     # Because of this behavior, it uses dramatically fewer round trips for
     # large Redshift warehouses. As an example, see this query for the columns:
     # https://github.com/sqlalchemy-redshift/sqlalchemy-redshift/blob/60b4db04c1d26071c291aeea52f1dcb5dd8b0eb0/sqlalchemy_redshift/dialect.py#L745.
-    scheme: str = Field(
+    scheme: HiddenFromDocs[str] = Field(
         default="redshift+redshift_connector",
         description="",
-        hidden_from_docs=True,
     )

     _database_alias_removed = pydantic_removed_field("database_alias")
@@ -216,6 +216,9 @@ class RedshiftConfig(

     @root_validator(skip_on_failure=True)
     def connection_config_compatibility_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
+
         if (
             ("options" in values and "connect_args" in values["options"])
             and "extra_client_options" in values