acryl-datahub 1.2.0.10rc3__py3-none-any.whl → 1.2.0.10rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (82)
  1. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/METADATA +2668 -2752
  2. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/RECORD +82 -82
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/assertion/assertion.py +1 -1
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  7. datahub/api/entities/dataset/dataset.py +9 -18
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/api/graphql/operation.py +10 -6
  10. datahub/cli/docker_check.py +2 -2
  11. datahub/configuration/common.py +29 -1
  12. datahub/configuration/connection_resolver.py +5 -2
  13. datahub/configuration/import_resolver.py +7 -4
  14. datahub/configuration/pydantic_migration_helpers.py +0 -9
  15. datahub/configuration/source_common.py +3 -2
  16. datahub/configuration/validate_field_deprecation.py +5 -2
  17. datahub/configuration/validate_field_removal.py +5 -2
  18. datahub/configuration/validate_field_rename.py +6 -5
  19. datahub/configuration/validate_multiline_string.py +5 -2
  20. datahub/ingestion/run/pipeline_config.py +2 -2
  21. datahub/ingestion/source/azure/azure_common.py +1 -1
  22. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  23. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  24. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  25. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  26. datahub/ingestion/source/datahub/config.py +8 -9
  27. datahub/ingestion/source/delta_lake/config.py +1 -1
  28. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  29. datahub/ingestion/source/feast.py +8 -10
  30. datahub/ingestion/source/fivetran/config.py +1 -1
  31. datahub/ingestion/source/ge_profiling_config.py +26 -22
  32. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  33. datahub/ingestion/source/grafana/models.py +12 -14
  34. datahub/ingestion/source/hex/hex.py +6 -1
  35. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  36. datahub/ingestion/source/kafka_connect/common.py +2 -2
  37. datahub/ingestion/source/looker/looker_common.py +1 -1
  38. datahub/ingestion/source/looker/looker_config.py +15 -4
  39. datahub/ingestion/source/looker/lookml_config.py +1 -1
  40. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  41. datahub/ingestion/source/metadata/lineage.py +1 -1
  42. datahub/ingestion/source/mode.py +13 -5
  43. datahub/ingestion/source/nifi.py +1 -1
  44. datahub/ingestion/source/powerbi/config.py +14 -21
  45. datahub/ingestion/source/preset.py +1 -1
  46. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  47. datahub/ingestion/source/redshift/config.py +6 -3
  48. datahub/ingestion/source/salesforce.py +13 -9
  49. datahub/ingestion/source/schema/json_schema.py +14 -14
  50. datahub/ingestion/source/sigma/data_classes.py +3 -0
  51. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  52. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  53. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  54. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  55. datahub/ingestion/source/sql/athena.py +2 -1
  56. datahub/ingestion/source/sql/clickhouse.py +12 -7
  57. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  58. datahub/ingestion/source/sql/druid.py +2 -2
  59. datahub/ingestion/source/sql/hive.py +4 -3
  60. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  61. datahub/ingestion/source/sql/mssql/source.py +2 -2
  62. datahub/ingestion/source/sql/mysql.py +2 -2
  63. datahub/ingestion/source/sql/oracle.py +3 -3
  64. datahub/ingestion/source/sql/presto.py +2 -1
  65. datahub/ingestion/source/sql/teradata.py +4 -4
  66. datahub/ingestion/source/sql/trino.py +2 -1
  67. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  68. datahub/ingestion/source/sql/vertica.py +1 -1
  69. datahub/ingestion/source/sql_queries.py +6 -6
  70. datahub/ingestion/source/state/checkpoint.py +5 -1
  71. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  72. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  73. datahub/ingestion/source/superset.py +1 -2
  74. datahub/ingestion/source/tableau/tableau.py +17 -3
  75. datahub/ingestion/source/unity/config.py +7 -3
  76. datahub/ingestion/source/usage/usage_common.py +3 -3
  77. datahub/ingestion/source_config/pulsar.py +3 -1
  78. datahub/sdk/search_filters.py +1 -7
  79. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/cockroachdb.py
@@ -1,6 +1,6 @@
 from pydantic.fields import Field
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -14,8 +14,10 @@ from datahub.ingestion.source.sql.postgres import PostgresConfig, PostgresSource
 
 
 class CockroachDBConfig(PostgresConfig):
-    scheme = Field(default="cockroachdb+psycopg2", description="database scheme")
-    schema_pattern = Field(
+    scheme: HiddenFromDocs[str] = Field(
+        default="cockroachdb+psycopg2", description="database scheme"
+    )
+    schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["information_schema", "crdb_internal"])
     )
 
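This is the first of many hunks in this release that replace the `hidden_from_docs=True` field kwarg with a `HiddenFromDocs[...]` annotation. A minimal sketch of how such a marker can be built on `typing.Annotated`, so that pydantic and type checkers still see the underlying type; the real definition in `datahub.configuration.common` may differ:

```python
from typing import Annotated, get_args, get_origin, get_type_hints

from pydantic import BaseModel, Field


class _HiddenFromDocsMarker:
    """Sentinel stored in the annotation metadata instead of a Field() kwarg."""


class HiddenFromDocs:
    # HiddenFromDocs[str] -> Annotated[str, _HiddenFromDocsMarker], so the
    # wrapped type is what pydantic actually validates against.
    def __class_getitem__(cls, item):
        return Annotated[item, _HiddenFromDocsMarker]


class ExampleConfig(BaseModel):
    scheme: HiddenFromDocs[str] = Field(default="druid")
    host_port: str = "localhost:8082"


def is_hidden(model: type, field: str) -> bool:
    # Docs generation can inspect the annotation metadata for the marker.
    ann = get_type_hints(model, include_extras=True)[field]
    return get_origin(ann) is Annotated and _HiddenFromDocsMarker in get_args(ann)


assert is_hidden(ExampleConfig, "scheme")
assert not is_hidden(ExampleConfig, "host_port")
```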
datahub/ingestion/source/sql/druid.py
@@ -6,7 +6,7 @@ from pydantic.fields import Field
 from pydruid.db.sqlalchemy import DruidDialect
 from sqlalchemy.exc import ResourceClosedError
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -34,7 +34,7 @@ DruidDialect.get_table_names = get_table_names
 
 class DruidConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = "druid"
+    scheme: HiddenFromDocs[str] = "druid"
     schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["^(lookup|sysgit|view).*"]),
         description="regex patterns for schemas to filter in ingestion.",
datahub/ingestion/source/sql/hive.py
@@ -6,7 +6,7 @@ from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
-from pydantic.class_validators import validator
+from pydantic import validator
 from pydantic.fields import Field
 
 # This import verifies that the dependencies are available.
@@ -14,6 +14,7 @@ from pyhive import hive # noqa: F401
 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp
 from sqlalchemy.engine.reflection import Inspector
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
@@ -651,10 +652,10 @@ HiveDialect.get_view_definition = get_view_definition_patched
 
 class HiveConfig(TwoTierSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="hive", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="hive")
 
     # Overriding as table location lineage is richer implementation here than with include_table_location_lineage
-    include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True)
+    include_table_location_lineage: HiddenFromDocs[bool] = Field(default=False)
 
     emit_storage_lineage: bool = Field(
         default=False,
datahub/ingestion/source/sql/hive_metastore.py
@@ -1,17 +1,15 @@
 import base64
+import dataclasses
 import json
 import logging
 from collections import namedtuple
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
-from pydantic.dataclasses import dataclass
-from pydantic.fields import Field
-
-# This import verifies that the dependencies are available.
+from pydantic import Field
 from sqlalchemy import create_engine, text
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -73,7 +71,7 @@ class HiveMetastoreConfigMode(StrEnum):
     trino = "trino"
 
 
-@dataclass
+@dataclasses.dataclass
 class ViewDataset:
     dataset_name: str
     schema_name: str
@@ -99,7 +97,7 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         default="localhost:3306",
         description="Host URL and port to connect to. Example: localhost:3306",
     )
-    scheme: str = Field(default="mysql+pymysql", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="mysql+pymysql")
 
     database_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
@@ -123,8 +121,8 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )
 
-    include_view_lineage: bool = Field(
-        default=False, description="", hidden_from_docs=True
+    include_view_lineage: HiddenFromDocs[bool] = Field(
+        default=False,
     )
 
     include_catalog_name_in_ids: bool = Field(
datahub/ingestion/source/sql/mssql/source.py
@@ -13,7 +13,7 @@ from sqlalchemy.exc import ProgrammingError, ResourceClosedError
 from sqlalchemy.sql import quoted_name
 
 import datahub.metadata.schema_classes as models
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -75,7 +75,7 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
 class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
-    scheme: str = Field(default="mssql+pytds", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="mssql+pytds")
 
     # TODO: rename to include_procedures ?
     include_stored_procedures: bool = Field(
datahub/ingestion/source/sql/mysql.py
@@ -9,7 +9,7 @@ from sqlalchemy.dialects.mysql import BIT, base
 from sqlalchemy.dialects.mysql.enumerated import SET
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -57,7 +57,7 @@ base.ischema_names["decimal128"] = DECIMAL128
 class MySQLConnectionConfig(SQLAlchemyConnectionConfig):
     # defaults
     host_port: str = Field(default="localhost:3306", description="MySQL host URL.")
-    scheme: str = "mysql+pymysql"
+    scheme: HiddenFromDocs[str] = "mysql+pymysql"
 
 
 class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
datahub/ingestion/source/sql/oracle.py
@@ -110,10 +110,10 @@ class OracleConfig(BasicSQLAlchemyConfig):
         return v
 
     @pydantic.validator("data_dictionary_mode")
-    def check_data_dictionary_mode(cls, values):
-        if values not in ("ALL", "DBA"):
+    def check_data_dictionary_mode(cls, value):
+        if value not in ("ALL", "DBA"):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
-        return values
+        return value
 
     @pydantic.validator("thick_mode_lib_dir", always=True)
     def check_thick_mode_lib_dir(cls, v, values):
datahub/ingestion/source/sql/presto.py
@@ -8,6 +8,7 @@ from sqlalchemy import exc, sql
 from sqlalchemy.engine import reflection
 from sqlalchemy.engine.base import Engine
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -87,7 +88,7 @@ PrestoDialect._get_full_table = _get_full_table
 
 class PrestoConfig(TrinoConfig):
     # defaults
-    scheme: str = Field(default="presto", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="presto")
 
 
 @platform_name("Presto", doc_order=1)
datahub/ingestion/source/sql/teradata.py
@@ -468,23 +468,23 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
         ),
     )
 
-    database_pattern = Field(
+    database_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    include_table_lineage = Field(
+    include_table_lineage: bool = Field(
         default=False,
         description="Whether to include table lineage in the ingestion. "
         "This requires to have the table lineage feature enabled.",
     )
 
-    include_view_lineage = Field(
+    include_view_lineage: bool = Field(
         default=True,
         description="Whether to include view lineage in the ingestion. "
         "This requires to have the view lineage feature enabled.",
     )
 
-    include_queries = Field(
+    include_queries: bool = Field(
         default=True,
         description="Whether to generate query entities for SQL queries. "
         "Query entities provide metadata about individual SQL queries including "
datahub/ingestion/source/sql/trino.py
@@ -18,6 +18,7 @@ from sqlalchemy.types import TypeEngine
 from trino.sqlalchemy import datatype
 from trino.sqlalchemy.dialect import TrinoDialect
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -222,7 +223,7 @@ class ConnectorDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
 
 class TrinoConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="trino", description="", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="trino")
     database: str = Field(description="database (catalog)")
 
     catalog_to_connector_details: Dict[str, ConnectorDetail] = Field(
datahub/ingestion/source/sql/two_tier_sql_source.py
@@ -7,7 +7,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine import URL
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -27,11 +27,10 @@ class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         # The superclass contains a `schema_pattern` field, so we need this here
         # to override the documentation.
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
         description="Deprecated in favour of database_pattern.",
     )
 
datahub/ingestion/source/sql/vertica.py
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tupl
 
 import pydantic
 import pytest
-from pydantic.class_validators import validator
+from pydantic import validator
 from vertica_sqlalchemy_dialect.base import VerticaInspector
 
 from datahub.configuration.common import AllowDenyPattern
datahub/ingestion/source/sql_queries.py
@@ -8,6 +8,7 @@ from typing import ClassVar, Iterable, List, Optional, Union
 
 from pydantic import BaseModel, Field, validator
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
@@ -66,22 +67,21 @@ class SqlQueriesSourceConfig(
         default=BaseUsageConfig(),
     )
 
-    use_schema_resolver: bool = Field(
+    use_schema_resolver: HiddenFromDocs[bool] = Field(
+        True,
         description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.",
-        default=True,
-        hidden_from_docs=True,
     )
     default_db: Optional[str] = Field(
+        None,
         description="The default database to use for unqualified table names",
-        default=None,
     )
     default_schema: Optional[str] = Field(
+        None,
         description="The default schema to use for unqualified table names",
-        default=None,
     )
     override_dialect: Optional[str] = Field(
+        None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
-        default=None,
     )
 
 
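The defaults here move from a `default=` kwarg to the first positional argument of `Field()`. The two spellings are equivalent, since `Field`'s first positional parameter is the default value; for illustration:

```python
from typing import Optional

from pydantic import BaseModel, Field


class Example(BaseModel):
    # Field's first positional parameter is the default value.
    a: Optional[str] = Field(None, description="positional default")
    b: Optional[str] = Field(default=None, description="keyword default")


assert Example().a is None and Example().b is None
```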
datahub/ingestion/source/state/checkpoint.py
@@ -68,7 +68,11 @@ class CheckpointStateBase(ConfigModel):
 
     @staticmethod
     def _to_bytes_utf8(model: ConfigModel) -> bytes:
-        return model.json(exclude={"version", "serde"}).encode("utf-8")
+        pydantic_json = model.model_dump_json(exclude={"version", "serde"})
+        # We decode and re-encode so that Python's default whitespace is included.
+        # This is purely to keep tests consistent as we migrate to pydantic v2,
+        # and can be removed once we're fully migrated.
+        return json.dumps(json.loads(pydantic_json)).encode("utf-8")
 
     @staticmethod
     def _to_bytes_base85_json(
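The `json.loads`/`json.dumps` round-trip exists because pydantic v2's `model_dump_json()` emits compact JSON, while v1's `.json()` used `json.dumps` defaults with a space after each separator. A small illustration (pydantic v2 assumed; the model is made up):

```python
import json

from pydantic import BaseModel


class State(BaseModel):
    urns: list


state = State(urns=["urn:li:dataset:1"])
compact = state.model_dump_json()  # '{"urns":["urn:li:dataset:1"]}'
spaced = json.dumps(json.loads(compact))  # '{"urns": ["urn:li:dataset:1"]}'
assert compact != spaced  # v1's .json() produced the spaced form
```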
datahub/ingestion/source/state/entity_removal_state.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterable, List, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Tuple, Type
 
 import pydantic
 
@@ -8,13 +8,16 @@ from datahub.utilities.checkpoint_state_util import CheckpointStateUtil
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn import guess_entity_type
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 STATEFUL_INGESTION_IGNORED_ENTITY_TYPES = {
     "dataProcessInstance",
     "query",
 }
 
 
-def pydantic_state_migrator(mapping: Dict[str, str]) -> classmethod:
+def pydantic_state_migrator(mapping: Dict[str, str]) -> "V1RootValidator":
     # mapping would be something like:
     # {
     #     'encoded_view_urns': 'dataset',
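The `TYPE_CHECKING` guard keeps `pydantic.deprecated` out of the runtime import graph; the quoted return annotation is a forward reference that only type checkers resolve. The general pattern, sketched:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported for static analysis only; never executed at runtime.
    from pydantic.deprecated.class_validators import V1RootValidator


def make_migrator() -> "V1RootValidator":
    # The quoted annotation is a forward reference, so the name does not
    # need to exist at runtime.
    ...
```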
datahub/ingestion/source/state/stateful_ingestion_base.py
@@ -10,6 +10,7 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
     DynamicTypedConfig,
+    HiddenFromDocs,
 )
 from datahub.configuration.pydantic_migration_helpers import GenericModel
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
@@ -55,25 +56,21 @@ class StatefulIngestionConfig(ConfigModel):
         description="Whether or not to enable stateful ingest. "
         "Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",
     )
-    max_checkpoint_state_size: pydantic.PositiveInt = Field(
+    max_checkpoint_state_size: HiddenFromDocs[pydantic.PositiveInt] = Field(
         default=2**24,  # 16 MB
         description="The maximum size of the checkpoint state in bytes. Default is 16MB",
-        hidden_from_docs=True,
     )
-    state_provider: Optional[DynamicTypedStateProviderConfig] = Field(
+    state_provider: HiddenFromDocs[Optional[DynamicTypedStateProviderConfig]] = Field(
         default=None,
         description="The ingestion state provider configuration.",
-        hidden_from_docs=True,
     )
-    ignore_old_state: bool = Field(
+    ignore_old_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the previous checkpoint state.",
-        hidden_from_docs=True,
     )
-    ignore_new_state: bool = Field(
+    ignore_new_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the current checkpoint state.",
-        hidden_from_docs=True,
     )
 
     @pydantic.root_validator(skip_on_failure=True)
datahub/ingestion/source/superset.py
@@ -9,8 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 import dateutil.parser as dp
 import requests
 import sqlglot
-from pydantic import BaseModel
-from pydantic.class_validators import root_validator, validator
+from pydantic import BaseModel, root_validator, validator
 from pydantic.fields import Field
 from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
datahub/ingestion/source/tableau/tableau.py
@@ -3,6 +3,7 @@ import logging
 import re
 import time
 from collections import OrderedDict, defaultdict
+from copy import deepcopy
 from dataclasses import dataclass, field as dataclass_field
 from datetime import datetime, timedelta, timezone
 from functools import lru_cache
@@ -474,6 +475,13 @@ class TableauPageSizeConfig(ConfigModel):
         return self.database_table_page_size or self.page_size
 
 
+_IngestHiddenAssetsOptionsType = Literal["worksheet", "dashboard"]
+_IngestHiddenAssetsOptions: List[_IngestHiddenAssetsOptionsType] = [
+    "worksheet",
+    "dashboard",
+]
+
+
 class TableauConfig(
     DatasetLineageProviderConfigBase,
     StatefulIngestionConfigBase,
@@ -625,8 +633,8 @@
         description="Configuration settings for ingesting Tableau groups and their capabilities as custom properties.",
     )
 
-    ingest_hidden_assets: Union[List[Literal["worksheet", "dashboard"]], bool] = Field(
-        default=["worksheet", "dashboard"],
+    ingest_hidden_assets: Union[List[_IngestHiddenAssetsOptionsType], bool] = Field(
+        _IngestHiddenAssetsOptions,
         description=(
             "When enabled, hidden worksheets and dashboards are ingested into Datahub."
            " If a dashboard or worksheet is hidden in Tableau the luid is blank."
@@ -648,6 +656,11 @@
     # pre = True because we want to take some decision before pydantic initialize the configuration to default values
     @root_validator(pre=True)
     def projects_backward_compatibility(cls, values: Dict) -> Dict:
+        # In-place update of the input dict would cause state contamination. This was discovered through test failures
+        # in test_hex.py where the same dict is reused.
+        # So a copy is performed first.
+        values = deepcopy(values)
+
         projects = values.get("projects")
         project_pattern = values.get("project_pattern")
         project_path_pattern = values.get("project_path_pattern")
@@ -659,6 +672,7 @@
             values["project_pattern"] = AllowDenyPattern(
                 allow=[f"^{prj}$" for prj in projects]
             )
+            values.pop("projects")
         elif (project_pattern or project_path_pattern) and projects:
             raise ValueError(
                 "projects is deprecated. Please use project_path_pattern only."
@@ -670,7 +684,7 @@
 
         return values
 
-    @root_validator()
+    @root_validator(skip_on_failure=True)
     def validate_config_values(cls, values: Dict) -> Dict:
         tags_for_hidden_assets = values.get("tags_for_hidden_assets")
         ingest_tags = values.get("ingest_tags")
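The new `deepcopy` matters because a `pre=True` root validator receives the caller's raw dict, and the added `values.pop("projects")` would otherwise leak back into any dict reused across instantiations (the state contamination the in-diff comment describes). A stripped-down illustration in plain Python:

```python
from copy import deepcopy


def normalize(values: dict) -> dict:
    # Without this copy, the pop() below would mutate the caller's dict.
    values = deepcopy(values)
    if "projects" in values:
        projects = values.pop("projects")
        values["project_pattern"] = {"allow": [f"^{p}$" for p in projects]}
    return values


raw = {"projects": ["Analytics"]}
normalize(raw)
assert raw == {"projects": ["Analytics"]}  # the caller's dict is untouched
```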
datahub/ingestion/source/unity/config.py
@@ -8,7 +8,12 @@ import pydantic
 from pydantic import Field
 from typing_extensions import Literal
 
-from datahub.configuration.common import AllowDenyPattern, ConfigEnum, ConfigModel
+from datahub.configuration.common import (
+    AllowDenyPattern,
+    ConfigEnum,
+    ConfigModel,
+    HiddenFromDocs,
+)
 from datahub.configuration.source_common import (
     DatasetSourceConfigMixin,
     LowerCaseDatasetUrnConfigMixin,
@@ -285,10 +290,9 @@ class UnityCatalogSourceConfig(
         description="Limit the number of columns to get column level lineage. ",
     )
 
-    lineage_max_workers: int = pydantic.Field(
+    lineage_max_workers: HiddenFromDocs[int] = pydantic.Field(
         default=5 * (os.cpu_count() or 4),
         description="Number of worker threads to use for column lineage thread pool executor. Set to 1 to disable.",
-        hidden_from_docs=True,
     )
 
     databricks_api_page_size: int = pydantic.Field(
datahub/ingestion/source/usage/usage_common.py
@@ -18,7 +18,7 @@ import pydantic
 from pydantic.fields import Field
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     BucketDuration,
@@ -194,13 +194,13 @@ class GenericAggregatedDataset(Generic[ResourceType]):
 
 
 class BaseUsageConfig(BaseTimeWindowConfig):
-    queries_character_limit: int = Field(
+    queries_character_limit: HiddenFromDocs[int] = Field(
+        # Hidden since we don't want to encourage people to break elasticsearch.
         default=DEFAULT_QUERIES_CHARACTER_LIMIT,
         description=(
             "Total character limit for all queries in a single usage aspect."
             " Queries will be truncated to length `queries_character_limit / top_n_queries`."
         ),
-        hidden_from_docs=True,  # Don't want to encourage people to break elasticsearch
     )
 
     top_n_queries: pydantic.PositiveInt = Field(
datahub/ingestion/source_config/pulsar.py
@@ -2,6 +2,7 @@ import re
 from typing import Dict, List, Optional, Union
 from urllib.parse import urlparse
 
+import pydantic
 from pydantic import Field, validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -121,7 +122,8 @@ class PulsarSourceConfig(
         )
         return client_secret
 
-    @validator("web_service_url")
+    @pydantic.field_validator("web_service_url", mode="after")
+    @classmethod
     def web_service_url_scheme_host_port(cls, val: str) -> str:
         # Tokenize the web url
         url = urlparse(val)
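This is the pydantic v2 spelling of a single-field validator: `field_validator` replaces the v1 `validator`, with an explicit `@classmethod` stacked under it as the v2 docs recommend. A self-contained sketch (not DataHub code):

```python
import pydantic
from pydantic import BaseModel


class Example(BaseModel):
    web_service_url: str

    @pydantic.field_validator("web_service_url", mode="after")
    @classmethod
    def strip_trailing_slash(cls, val: str) -> str:
        # mode="after" runs once the value has been validated as a str.
        return val.rstrip("/")


print(Example(web_service_url="http://localhost:8080/").web_service_url)
# -> http://localhost:8080
```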
datahub/sdk/search_filters.py
@@ -39,13 +39,7 @@ _OrFilters = List[_AndSearchFilterRule]
 
 
 class _BaseFilter(ConfigModel):
-    class Config:
-        # We can't wrap this in a TYPE_CHECKING block because the pydantic plugin
-        # doesn't recognize it properly. So unfortunately we'll need to live
-        # with the deprecation warning w/ pydantic v2.
-        allow_population_by_field_name = True
-        if PYDANTIC_VERSION_2:
-            populate_by_name = True
+    model_config = pydantic.ConfigDict(populate_by_name=True)
 
     @abc.abstractmethod
     def compile(self) -> _OrFilters: ...
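`model_config = pydantic.ConfigDict(populate_by_name=True)` is the v2 replacement for the v1 `class Config` with `allow_population_by_field_name = True`, and it drops the dual-version shim. What the setting does, in a short sketch (the alias is illustrative):

```python
import pydantic
from pydantic import BaseModel, Field


class Filter(BaseModel):
    model_config = pydantic.ConfigDict(populate_by_name=True)

    field_name: str = Field(alias="field")


assert Filter(field="x").field_name == "x"  # populated by alias
assert Filter(field_name="x").field_name == "x"  # populated by name too
```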