acryl-datahub 1.2.0.10rc3__py3-none-any.whl → 1.2.0.10rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of acryl-datahub might be problematic.
Files changed (82)
  1. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/METADATA +2668 -2752
  2. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/RECORD +82 -82
  3. datahub/_version.py +1 -1
  4. datahub/api/entities/assertion/assertion.py +1 -1
  5. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  6. datahub/api/entities/dataproduct/dataproduct.py +6 -3
  7. datahub/api/entities/dataset/dataset.py +9 -18
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/api/graphql/operation.py +10 -6
  10. datahub/cli/docker_check.py +2 -2
  11. datahub/configuration/common.py +29 -1
  12. datahub/configuration/connection_resolver.py +5 -2
  13. datahub/configuration/import_resolver.py +7 -4
  14. datahub/configuration/pydantic_migration_helpers.py +0 -9
  15. datahub/configuration/source_common.py +3 -2
  16. datahub/configuration/validate_field_deprecation.py +5 -2
  17. datahub/configuration/validate_field_removal.py +5 -2
  18. datahub/configuration/validate_field_rename.py +6 -5
  19. datahub/configuration/validate_multiline_string.py +5 -2
  20. datahub/ingestion/run/pipeline_config.py +2 -2
  21. datahub/ingestion/source/azure/azure_common.py +1 -1
  22. datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
  23. datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
  24. datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
  25. datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
  26. datahub/ingestion/source/datahub/config.py +8 -9
  27. datahub/ingestion/source/delta_lake/config.py +1 -1
  28. datahub/ingestion/source/dremio/dremio_config.py +3 -4
  29. datahub/ingestion/source/feast.py +8 -10
  30. datahub/ingestion/source/fivetran/config.py +1 -1
  31. datahub/ingestion/source/ge_profiling_config.py +26 -22
  32. datahub/ingestion/source/grafana/grafana_config.py +2 -2
  33. datahub/ingestion/source/grafana/models.py +12 -14
  34. datahub/ingestion/source/hex/hex.py +6 -1
  35. datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
  36. datahub/ingestion/source/kafka_connect/common.py +2 -2
  37. datahub/ingestion/source/looker/looker_common.py +1 -1
  38. datahub/ingestion/source/looker/looker_config.py +15 -4
  39. datahub/ingestion/source/looker/lookml_config.py +1 -1
  40. datahub/ingestion/source/metadata/business_glossary.py +7 -7
  41. datahub/ingestion/source/metadata/lineage.py +1 -1
  42. datahub/ingestion/source/mode.py +13 -5
  43. datahub/ingestion/source/nifi.py +1 -1
  44. datahub/ingestion/source/powerbi/config.py +14 -21
  45. datahub/ingestion/source/preset.py +1 -1
  46. datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
  47. datahub/ingestion/source/redshift/config.py +6 -3
  48. datahub/ingestion/source/salesforce.py +13 -9
  49. datahub/ingestion/source/schema/json_schema.py +14 -14
  50. datahub/ingestion/source/sigma/data_classes.py +3 -0
  51. datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
  52. datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
  53. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
  54. datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
  55. datahub/ingestion/source/sql/athena.py +2 -1
  56. datahub/ingestion/source/sql/clickhouse.py +12 -7
  57. datahub/ingestion/source/sql/cockroachdb.py +5 -3
  58. datahub/ingestion/source/sql/druid.py +2 -2
  59. datahub/ingestion/source/sql/hive.py +4 -3
  60. datahub/ingestion/source/sql/hive_metastore.py +7 -9
  61. datahub/ingestion/source/sql/mssql/source.py +2 -2
  62. datahub/ingestion/source/sql/mysql.py +2 -2
  63. datahub/ingestion/source/sql/oracle.py +3 -3
  64. datahub/ingestion/source/sql/presto.py +2 -1
  65. datahub/ingestion/source/sql/teradata.py +4 -4
  66. datahub/ingestion/source/sql/trino.py +2 -1
  67. datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
  68. datahub/ingestion/source/sql/vertica.py +1 -1
  69. datahub/ingestion/source/sql_queries.py +6 -6
  70. datahub/ingestion/source/state/checkpoint.py +5 -1
  71. datahub/ingestion/source/state/entity_removal_state.py +5 -2
  72. datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
  73. datahub/ingestion/source/superset.py +1 -2
  74. datahub/ingestion/source/tableau/tableau.py +17 -3
  75. datahub/ingestion/source/unity/config.py +7 -3
  76. datahub/ingestion/source/usage/usage_common.py +3 -3
  77. datahub/ingestion/source_config/pulsar.py +3 -1
  78. datahub/sdk/search_filters.py +1 -7
  79. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/WHEEL +0 -0
  80. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/entry_points.txt +0 -0
  81. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/licenses/LICENSE +0 -0
  82. {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -89,63 +89,62 @@ class PathSpec(ConfigModel):
         description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
     )
     exclude: Optional[List[str]] = Field(
-        default=[],
+        [],
         description="list of paths in glob pattern which will be excluded while scanning for the datasets",
     )
     file_types: List[str] = Field(
-        default=SUPPORTED_FILE_TYPES,
+        SUPPORTED_FILE_TYPES,
         description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
     )

     default_extension: Optional[str] = Field(
-        default=None,
+        None,
         description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
     )

     table_name: Optional[str] = Field(
-        default=None,
+        None,
         description="Display name of the dataset.Combination of named variables from include path and strings",
     )

     # This is not used yet, but will be used in the future to sort the partitions
-    sort_key: Optional[SortKey] = Field(
-        hidden_from_docs=True,
-        default=None,
+    sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
+        None,
         description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
     )

     enable_compression: bool = Field(
-        default=True,
+        True,
         description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
     )

     sample_files: bool = Field(
-        default=True,
+        True,
         description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
     )

     allow_double_stars: bool = Field(
-        default=False,
+        False,
         description="Allow double stars in the include path. This can affect performance significantly if enabled",
     )

     autodetect_partitions: bool = Field(
-        default=True,
+        True,
         description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
     )

     traversal_method: FolderTraversalMethod = Field(
-        default=FolderTraversalMethod.MAX,
+        FolderTraversalMethod.MAX,
         description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
     )

     include_hidden_folders: bool = Field(
-        default=False,
+        False,
         description="Include hidden folders in the traversal (folders starting with . or _",
     )

     tables_filter_pattern: AllowDenyPattern = Field(
-        default=AllowDenyPattern.allow_all(),
+        AllowDenyPattern.allow_all(),
         description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
     )

@@ -479,7 +478,8 @@ class PathSpec(ConfigModel):
         return glob_include

     @pydantic.root_validator(skip_on_failure=True)
-    def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
+    @staticmethod
+    def validate_path_spec(values: Dict) -> Dict[str, Any]:
         # validate that main fields are populated
         required_fields = ["include", "file_types", "default_extension"]
         for f in required_fields:
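
Note on the `HiddenFromDocs` pattern above (it recurs throughout this release): the `hidden_from_docs=True` keyword on `Field(...)` is replaced by wrapping the annotation in `HiddenFromDocs[...]`, imported from `datahub.configuration.common`. That module's change (+29 -1 in the file list) is not included in this section, so the sketch below only illustrates the general Annotated-marker approach the call sites imply; the real definition may differ.

    from typing import Annotated, Optional

    from pydantic import BaseModel, Field


    class _HiddenFromDocsMarker:
        """Marker carried in Annotated metadata; doc generation can skip fields tagged with it."""


    class HiddenFromDocs:
        # Hypothetical stand-in: HiddenFromDocs[X] -> Annotated[X, _HiddenFromDocsMarker()]
        def __class_getitem__(cls, item):
            return Annotated[item, _HiddenFromDocsMarker()]


    class ExampleConfig(BaseModel):
        # Behaves like a plain Optional[str] at runtime; the marker only matters to doc tooling.
        internal_flag: HiddenFromDocs[Optional[str]] = Field(None, description="Internal use only.")

Either way, the field keeps its runtime behavior; only the generated configuration docs change.
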
@@ -4,7 +4,7 @@ from typing import Optional, Set
 import pydantic
 from pydantic import Field, root_validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         ),
     )

-    pull_from_datahub_api: bool = Field(
+    pull_from_datahub_api: HiddenFromDocs[bool] = Field(
         default=False,
         description="Use the DataHub API to fetch versioned aspects.",
-        hidden_from_docs=True,
     )

-    max_workers: int = Field(
+    max_workers: HiddenFromDocs[int] = Field(
         default=5 * (os.cpu_count() or 4),
         description="Number of worker threads to use for datahub api ingestion.",
-        hidden_from_docs=True,
     )

     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
@@ -118,10 +116,11 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )

-    structured_properties_template_cache_invalidation_interval: int = Field(
-        hidden_from_docs=True,
-        default=60,
-        description="Interval in seconds to invalidate the structured properties template cache.",
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )

     query_timeout: Optional[int] = Field(
@@ -78,7 +78,7 @@ class DeltaLakeSourceConfig(
         "When set to `False`, number_of_files in delta table can not be reported.",
     )

-    s3: Optional[S3] = Field()
+    s3: Optional[S3] = Field(None)

     @cached_property
     def is_s3(self):
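
The `Field()` to `Field(None)` change (and the similar `Optional[...] = None` additions elsewhere in this diff) follows pydantic v2 semantics, where an `Optional[...]` annotation no longer implies a `None` default; without an explicit default the field becomes required. A minimal sketch of the difference, using a stand-in model rather than the real S3 config:

    from typing import Optional

    from pydantic import BaseModel, Field, ValidationError


    class BareField(BaseModel):
        s3: Optional[str] = Field()  # required under pydantic v2, despite the Optional annotation


    class NoneDefault(BaseModel):
        s3: Optional[str] = Field(None)  # genuinely optional


    try:
        BareField()
    except ValidationError:
        print("s3 is required when no default is given")

    print(NoneDefault().s3)  # None
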
@@ -4,7 +4,7 @@ from typing import List, Literal, Optional
 import certifi
 from pydantic import Field, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -100,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
     query_timeout: int = Field(
         default=300, description="Time before cancelling Dremio profiling query"
     )
-    include_field_median_value: bool = Field(
+    include_field_median_value: HiddenFromDocs[bool] = Field(
+        # Hidden because median causes a number of issues in Dremio.
         default=False,
-        hidden_from_docs=True,
-        description="Median causes a number of issues in Dremio.",
     )


@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union

 import feast.types
 from feast import (
@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
     StatefulIngestionConfigBase,
 ):
     path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[str] = Field(
+    fs_yaml_file: Optional[pathlib.Path] = Field(
         default=None,
         description="Path to the `feature_store.yaml` file used to configure the feature store",
     )
@@ -142,17 +143,14 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     - Column types associated with each entity and feature
     """

-    platform = "feast"
-    source_config: FeastRepositorySourceConfig
-    report: StaleEntityRemovalSourceReport
-    feature_store: FeatureStore
+    platform: ClassVar[str] = "feast"

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
-        self.source_config = config
-        self.ctx = ctx
-        self.report = StaleEntityRemovalSourceReport()
-        self.feature_store = FeatureStore(
+        self.source_config: FeastRepositorySourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
+        self.feature_store: FeatureStore = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
         )
@@ -102,7 +102,7 @@ class FivetranLogConfig(ConfigModel):
         "destination_config", "snowflake_destination_config"
     )

-    @root_validator(pre=True)
+    @root_validator(skip_on_failure=True)
     def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
         destination_platform = values["destination_platform"]
         if destination_platform == "snowflake":
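
For context on the decorator change above: `pre=True` runs the validator on the raw input before field validation, while `skip_on_failure=True` runs it afterwards on validated values and skips it if any field already failed. That is presumably why this validator, which indexes `values["destination_platform"]` directly, now runs after field validation, when defaults and aliases are already resolved. A rough sketch of the two modes (v1-style API):

    from typing import Dict

    from pydantic import BaseModel, root_validator


    class DestinationConfig(BaseModel):
        destination_platform: str = "snowflake"

        @root_validator(pre=True)
        def before_fields(cls, values: Dict) -> Dict:
            # Raw input: defaults/aliases are not applied yet, so keys may be missing here.
            return values

        @root_validator(skip_on_failure=True)
        def after_fields(cls, values: Dict) -> Dict:
            # Validated values: defaults are filled in, and this hook is skipped if a field failed.
            assert "destination_platform" in values
            return values
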
@@ -1,12 +1,12 @@
 import datetime
 import logging
 import os
-from typing import Any, Dict, List, Optional
+from typing import Annotated, Any, Dict, List, Optional

 import pydantic
 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, SupportedSources
 from datahub.ingestion.source_config.operation_config import OperationConfig

 _PROFILING_FLAGS_TO_REPORT = {
@@ -120,37 +120,40 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         "number of columns to profile goes up.",
     )

-    profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
+    profile_if_updated_since_days: Annotated[
+        Optional[pydantic.PositiveFloat], SupportedSources(["snowflake", "bigquery"])
+    ] = Field(
         default=None,
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

-    profile_table_size_limit: Optional[int] = Field(
+    profile_table_size_limit: Annotated[
+        Optional[int],
+        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
+    ] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
-        schema_extra={
-            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
-        },
     )

-    profile_table_row_limit: Optional[int] = Field(
+    profile_table_row_limit: Annotated[
+        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
+    ] = Field(
         default=5000000,
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
     )

-    profile_table_row_count_estimate_only: bool = Field(
+    profile_table_row_count_estimate_only: Annotated[
+        bool, SupportedSources(["postgres", "mysql"])
+    ] = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
-        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -163,36 +166,37 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = Field(default=True, description="")

-    partition_profiling_enabled: bool = Field(
+    partition_profiling_enabled: Annotated[
+        bool, SupportedSources(["athena", "bigquery"])
+    ] = Field(
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
-        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
-    partition_datetime: Optional[datetime.datetime] = Field(
+    partition_datetime: Annotated[
+        Optional[datetime.datetime], SupportedSources(["bigquery"])
+    ] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
-        schema_extra={"supported_sources": ["bigquery"]},
     )
-    use_sampling: bool = Field(
+    use_sampling: Annotated[bool, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

-    sample_size: int = Field(
+    sample_size: Annotated[int, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

-    profile_external_tables: bool = Field(
+    profile_external_tables: Annotated[
+        bool, SupportedSources(["redshift", "snowflake"])
+    ] = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
-        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(
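
The profiling options above move the per-source support hint out of `schema_extra` and into `Annotated` metadata via `SupportedSources`, imported from `datahub.configuration.common` (not shown in this section). Whatever its exact definition, the mechanism is standard PEP 593 metadata, which doc tooling can read back without touching the JSON schema. A sketch with a hypothetical marker class:

    from typing import Annotated, Optional

    from pydantic import BaseModel, Field


    class SupportedSources:  # hypothetical stand-in for the imported marker
        def __init__(self, sources):
            self.sources = sources


    class ProfilingExample(BaseModel):
        sample_size: Annotated[int, SupportedSources(["bigquery", "snowflake"])] = Field(
            default=10000, description="Number of rows to sample."
        )
        partition_datetime: Annotated[Optional[str], SupportedSources(["bigquery"])] = Field(
            default=None
        )


    # A docs generator can recover the markers straight from the annotations:
    for name, hint in ProfilingExample.__annotations__.items():
        for meta in getattr(hint, "__metadata__", ()):
            if isinstance(meta, SupportedSources):
                print(name, "->", meta.sources)
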
@@ -2,7 +2,7 @@ from typing import Dict, Optional

 from pydantic import Field, SecretStr, validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     EnvConfigMixin,
@@ -37,7 +37,7 @@ class GrafanaSourceConfig(
 ):
     """Configuration for Grafana source"""

-    platform: str = Field(default="grafana", hidden_from_docs=True)
+    platform: HiddenFromDocs[str] = Field(default="grafana")
     url: str = Field(
         description="Grafana URL in the format http://your-grafana-instance with no trailing slash"
     )
@@ -10,9 +10,8 @@ References:

 from typing import Any, Dict, List, Optional

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

-from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey

 # Grafana-specific type definitions for better type safety
@@ -25,7 +24,11 @@ GrafanaFieldConfig = Dict[
 GrafanaTransformation = Dict[str, Any]  # Transformations: id, options


-class DatasourceRef(BaseModel):
+class _GrafanaBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class DatasourceRef(_GrafanaBaseModel):
     """Reference to a Grafana datasource."""

     type: Optional[str] = None  # Datasource type (prometheus, mysql, postgres, etc.)
@@ -33,13 +36,13 @@ class DatasourceRef(BaseModel):
     name: Optional[str] = None  # Datasource display name


-class Panel(BaseModel):
+class Panel(_GrafanaBaseModel):
     """Represents a Grafana dashboard panel."""

     id: str
     title: str
     description: str = ""
-    type: Optional[str]
+    type: Optional[str] = None
     # Query targets - each contains refId (A,B,C...), query/expr, datasource ref, etc.
     query_targets: List[GrafanaQueryTarget] = Field(
         default_factory=list, alias="targets"
@@ -52,16 +55,16 @@ class Panel(BaseModel):
     transformations: List[GrafanaTransformation] = Field(default_factory=list)


-class Dashboard(BaseModel):
+class Dashboard(_GrafanaBaseModel):
     """Represents a Grafana dashboard."""

     uid: str
     title: str
     description: str = ""
-    version: Optional[str]
+    version: Optional[str] = None
     panels: List[Panel]
     tags: List[str]
-    timezone: Optional[str]
+    timezone: Optional[str] = None
     refresh: Optional[str] = None
     schema_version: Optional[str] = Field(default=None, alias="schemaVersion")
     folder_id: Optional[str] = Field(default=None, alias="meta.folderId")
@@ -100,18 +103,13 @@ class Dashboard(BaseModel):
         return super().parse_obj(dashboard_dict)


-class Folder(BaseModel):
+class Folder(_GrafanaBaseModel):
     """Represents a Grafana folder."""

     id: str
     title: str
     description: Optional[str] = ""

-    if PYDANTIC_VERSION_2:
-        from pydantic import ConfigDict
-
-        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
-

 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""
@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
@@ -122,7 +123,11 @@ class HexSourceConfig(

     @root_validator(pre=True)
     def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
-        # lineage_end_time default = now
+        # In-place update of the input dict would cause state contamination. This was discovered through test failures
+        # in test_hex.py where the same dict is reused.
+        # So a deepcopy is performed first.
+        data = deepcopy(data)
+
         if "lineage_end_time" not in data or data["lineage_end_time"] is None:
             data["lineage_end_time"] = datetime.now(tz=timezone.utc)
         # if string is given, parse it
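
The `deepcopy` exists because `pre=True` root validators receive the caller's raw dict on pydantic v2's validation path, so writing defaults into it leaks state back into any dict that gets reused, which is what the referenced test failures caught. A compact sketch of the failure mode being avoided:

    from copy import deepcopy
    from datetime import datetime, timezone
    from typing import Any, Dict

    from pydantic import BaseModel, root_validator


    class LineageWindow(BaseModel):
        lineage_end_time: datetime

        @root_validator(pre=True)
        def default_end_time(cls, data: Dict[str, Any]) -> Dict[str, Any]:
            # Without the copy, under pydantic v2 the caller's dict would gain a lineage_end_time key.
            data = deepcopy(data)
            if data.get("lineage_end_time") is None:
                data["lineage_end_time"] = datetime.now(tz=timezone.utc)
            return data


    raw: Dict[str, Any] = {}  # imagine this dict is shared across several config objects or tests
    LineageWindow.parse_obj(raw)
    assert raw == {}  # stays clean because the validator copied before mutating
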
@@ -12,6 +12,7 @@ from pyiceberg.types import (
     IcebergType,
     IntegerType,
     LongType,
+    PrimitiveType,
     TimestampType,
     TimestamptzType,
     TimeType,
@@ -22,6 +23,7 @@ from pyiceberg.utils.datetime import (
     to_human_timestamp,
     to_human_timestamptz,
 )
+from typing_extensions import TypeGuard

 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.source.iceberg.iceberg_common import (
@@ -65,7 +67,7 @@ class IcebergProfiler:
         aggregated_values: Dict[int, Any],
         manifest_values: Dict[int, bytes],
     ) -> None:
-        for field_id, value_encoded in manifest_values.items():  # type: int, Any
+        for field_id, value_encoded in manifest_values.items():
             try:
                 field = schema.find_field(field_id)
             except ValueError:
@@ -240,7 +242,7 @@ class IcebergProfiler:
         return None

     @staticmethod
-    def _is_numeric_type(type: IcebergType) -> bool:
+    def _is_numeric_type(type: IcebergType) -> TypeGuard[PrimitiveType]:
         return isinstance(
             type,
             (
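
The `TypeGuard[PrimitiveType]` return type (from `typing_extensions`) lets type checkers narrow the checked value for the caller, so code that goes on to decode min/max bounds can treat it as a `PrimitiveType` without a cast. Roughly:

    from pyiceberg.types import IcebergType, IntegerType, LongType, PrimitiveType
    from typing_extensions import TypeGuard


    def _is_numeric_type(type: IcebergType) -> TypeGuard[PrimitiveType]:
        # A True result tells the type checker that `type` is a PrimitiveType in that branch.
        return isinstance(type, (IntegerType, LongType))  # the real check covers more types


    def example(field_type: IcebergType) -> None:
        if _is_numeric_type(field_type):
            narrowed: PrimitiveType = field_type  # accepted without a cast
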
@@ -4,7 +4,7 @@ from typing import Dict, Iterable, List, Optional

 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, LaxStr
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     PlatformInstanceConfigMixin,
@@ -29,7 +29,7 @@ CONNECTOR_CLASS = "connector.class"
 class ProvidedConfig(ConfigModel):
     provider: str
     path_key: str
-    value: str
+    value: LaxStr

 
 class GenericConnectorConfig(ConfigModel):
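
`LaxStr` also comes from `datahub.configuration.common`, whose changes are not part of this section. Judging from the call sites (the provider `value` here, and the glossary `custom_properties` and `version` fields below), it reads as a string type that tolerates numeric YAML scalars. A plausible sketch of the idea, not the actual definition:

    from typing import Annotated, Union

    from pydantic import BaseModel, BeforeValidator


    def _to_str(v: Union[str, int, float, bool]) -> str:
        # YAML parses `value: 9092` or `version: 1` as numbers; coerce them back to strings.
        return v if isinstance(v, str) else str(v)


    LaxStr = Annotated[str, BeforeValidator(_to_str)]  # hypothetical definition


    class ProvidedConfig(BaseModel):
        provider: str
        path_key: str
        value: LaxStr


    print(ProvidedConfig(provider="env", path_key="KAFKA_PORT", value=9092).value)  # "9092"
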
@@ -28,7 +28,7 @@ from looker_sdk.sdk.api40.models import (
     User,
     WriteQuery,
 )
-from pydantic.class_validators import validator
+from pydantic import validator

 import datahub.emitter.mce_builder as builder
 from datahub.api.entities.platformresource.platform_resource import (
@@ -5,10 +5,14 @@ from typing import Any, ClassVar, Dict, List, Optional, Tuple, Union, cast

 import pydantic
 from looker_sdk.sdk.api40.models import DBConnection
-from pydantic import Field, validator
+from pydantic import Field, model_validator, validator

 from datahub.configuration import ConfigModel
-from datahub.configuration.common import AllowDenyPattern, ConfigurationError
+from datahub.configuration.common import (
+    AllowDenyPattern,
+    ConfigurationError,
+    HiddenFromDocs,
+)
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -43,6 +47,14 @@ class NamingPattern(ConfigModel):
         assert isinstance(v, str), "pattern must be a string"
         return {"pattern": v}

+    @model_validator(mode="before")
+    @classmethod
+    def pydantic_v2_accept_raw_pattern(cls, v):
+        # Pydantic v2 compatibility: handle string input by converting to dict
+        if isinstance(v, str):
+            return {"pattern": v}
+        return v
+
     @classmethod
     def pydantic_validate_pattern(cls, v):
         assert isinstance(v, NamingPattern)
@@ -132,11 +144,10 @@ class LookerCommonConfig(EnvConfigMixin, PlatformInstanceConfigMixin):
         description="When enabled, attaches tags to measures, dimensions and dimension groups to make them more "
         "discoverable. When disabled, adds this information to the description of the column.",
     )
-    platform_name: str = Field(
+    platform_name: HiddenFromDocs[str] = Field(
         # TODO: This shouldn't be part of the config.
         "looker",
         description="Default platform name.",
-        hidden_from_docs=True,
     )
     extract_column_level_lineage: bool = Field(
         True,
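
The new `pydantic_v2_accept_raw_pattern` hook mirrors the existing v1-style coercion so that a bare string still turns into `{"pattern": ...}` when the model runs under pydantic v2. In isolation the pattern looks like this (illustrative values):

    from pydantic import BaseModel, model_validator


    class NamingPattern(BaseModel):
        pattern: str

        @model_validator(mode="before")
        @classmethod
        def accept_raw_pattern(cls, v):
            # Allow a plain string wherever a {"pattern": "..."} mapping is expected.
            if isinstance(v, str):
                return {"pattern": v}
            return v


    class Wrapper(BaseModel):
        naming: NamingPattern


    print(Wrapper.model_validate({"naming": "{platform}.{table}"}).naming.pattern)
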
@@ -122,7 +122,7 @@ class LookMLSourceConfig(
         description="List of regex patterns for LookML views to include in the extraction.",
     )
     parse_table_names_from_sql: bool = Field(True, description="See note below.")
-    api: Optional[LookerAPIConfig]
+    api: Optional[LookerAPIConfig] = None
     project_name: Optional[str] = Field(
         None,
         description="Required if you don't specify the `api` section. The project name within which all the model "
@@ -5,11 +5,11 @@ import time
 from dataclasses import dataclass, field
 from typing import Any, Dict, Iterable, List, Optional, TypeVar, Union

-from pydantic import validator
+import pydantic
 from pydantic.fields import Field

 import datahub.metadata.schema_classes as models
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, LaxStr
 from datahub.configuration.config_loader import load_config_file
 from datahub.emitter.mce_builder import (
     datahub_guid,
@@ -66,7 +66,7 @@ class GlossaryTermConfig(ConfigModel):
     contains: Optional[List[str]] = None
     values: Optional[List[str]] = None
     related_terms: Optional[List[str]] = None
-    custom_properties: Optional[Dict[str, str]] = None
+    custom_properties: Optional[Dict[str, LaxStr]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
     domain: Optional[str] = None

@@ -82,7 +82,7 @@ class GlossaryNodeConfig(ConfigModel):
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None
     knowledge_links: Optional[List[KnowledgeCard]] = None
-    custom_properties: Optional[Dict[str, str]] = None
+    custom_properties: Optional[Dict[str, LaxStr]] = None

     # Private fields.
     _urn: str
@@ -108,12 +108,12 @@ class BusinessGlossarySourceConfig(ConfigModel):


 class BusinessGlossaryConfig(DefaultConfig):
-    version: str
+    version: LaxStr
     terms: Optional[List["GlossaryTermConfig"]] = None
     nodes: Optional[List["GlossaryNodeConfig"]] = None

-    @validator("version")
-    def version_must_be_1(cls, v):
+    @pydantic.field_validator("version", mode="after")
+    def version_must_be_1(cls, v: str) -> str:
         if v != "1":
             raise ValueError("Only version 1 is supported")
         return v
@@ -49,7 +49,7 @@ class EntityConfig(EnvConfigMixin):
     name: str
     type: str
     platform: str
-    platform_instance: Optional[str]
+    platform_instance: Optional[str] = None

     @validator("type")
     def type_must_be_supported(cls, v: str) -> str: