acryl-datahub 1.2.0.10rc3__py3-none-any.whl → 1.2.0.10rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/METADATA +2668 -2752
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/RECORD +82 -82
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +1 -1
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +1 -2
- datahub/ingestion/source/tableau/tableau.py +17 -3
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/sdk/search_filters.py +1 -7
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.10rc3.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/top_level.txt +0 -0
datahub/api/entities/dataset/dataset.py CHANGED

@@ -27,7 +27,7 @@ from typing_extensions import TypeAlias
 
 import datahub.metadata.schema_classes as models
 from datahub.api.entities.structuredproperties.structuredproperties import AllowedTypes
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, LaxStr
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataset_urn,
@@ -143,7 +143,6 @@ class SchemaFieldSpecification(StrictModel):
     jsonPath: Union[None, str] = None
     nullable: bool = False
     description: Union[None, str] = None
-    doc: Union[None, str] = None  # doc is an alias for description
     label: Optional[str] = None
     created: Optional[dict] = None
     lastModified: Optional[dict] = None
@@ -221,14 +220,14 @@ class SchemaFieldSpecification(StrictModel):
         return v
 
     @root_validator(pre=True)
-    def
-    """Synchronize doc
+    def sync_doc_into_description(cls, values: Dict) -> Dict:
+        """Synchronize doc into description field if doc is provided."""
         description = values.get("description")
-        doc = values.
+        doc = values.pop("doc", None)
 
-        if
-
-
+        if doc is not None:
+            if description is not None:
+                raise ValueError("doc and description cannot both be provided")
             values["description"] = doc
 
         return values
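For readers skimming the dataset.py change above: the removed doc field is now folded into description by a pre root validator, and supplying both is rejected. The following standalone sketch mirrors that validator on a deliberately simplified model (the FieldSpec name and its single field are illustrative, not the real SchemaFieldSpecification):

from typing import Dict, Optional

from pydantic import BaseModel, root_validator


class FieldSpec(BaseModel):
    description: Optional[str] = None

    @root_validator(pre=True)
    def sync_doc_into_description(cls, values: Dict) -> Dict:
        """Synchronize doc into description field if doc is provided."""
        description = values.get("description")
        doc = values.pop("doc", None)

        if doc is not None:
            if description is not None:
                raise ValueError("doc and description cannot both be provided")
            values["description"] = doc

        return values


print(FieldSpec(doc="Primary key of the table").description)  # Primary key of the table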
@@ -296,10 +295,6 @@ class SchemaFieldSpecification(StrictModel):
         """Custom dict method for Pydantic v1 to handle YAML serialization properly."""
         exclude = kwargs.pop("exclude", None) or set()
 
-        # If description and doc are identical, exclude doc from the output
-        if self.description == self.doc and self.description is not None:
-            exclude.add("doc")
-
         # if nativeDataType and type are identical, exclude nativeDataType from the output
         if self.nativeDataType == self.type and self.nativeDataType is not None:
             exclude.add("nativeDataType")
@@ -327,10 +322,6 @@ class SchemaFieldSpecification(StrictModel):
         """Custom model_dump method for Pydantic v2 to handle YAML serialization properly."""
         exclude = kwargs.pop("exclude", None) or set()
 
-        # If description and doc are identical, exclude doc from the output
-        if self.description == self.doc and self.description is not None:
-            exclude.add("doc")
-
         # if nativeDataType and type are identical, exclude nativeDataType from the output
         if self.nativeDataType == self.type and self.nativeDataType is not None:
             exclude.add("nativeDataType")
@@ -387,7 +378,7 @@ class Dataset(StrictModel):
     name: Optional[str] = Field(None, validate_default=True)
     schema_metadata: Optional[SchemaSpecification] = Field(default=None, alias="schema")
     downstreams: Optional[List[str]] = None
-    properties: Optional[Dict[str,
+    properties: Optional[Dict[str, LaxStr]] = None
     subtype: Optional[str] = None
     subtypes: Optional[List[str]] = None
     tags: Optional[List[str]] = None
@@ -605,7 +596,7 @@ class Dataset(StrictModel):
                     ],
                     platformSchema=OtherSchemaClass(
                         rawSchema=yaml.dump(
-                            self.schema_metadata.
+                            self.schema_metadata.model_dump(
                                 exclude_none=True, exclude_unset=True
                             )
                         )
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED

@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import Iterable, List, Optional, Type, Union
 
 import yaml
 from pydantic import Field, StrictStr, validator

@@ -48,7 +48,7 @@ VALID_ENTITY_TYPE_URNS = [
 _VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
 
 
-def _validate_entity_type_urn(v: str) -> str:
+def _validate_entity_type_urn(cls: Type, v: str) -> str:
     urn = Urn.make_entity_type_urn(v)
     if urn not in VALID_ENTITY_TYPE_URNS:
         raise ValueError(
datahub/api/graphql/operation.py CHANGED

@@ -1,7 +1,7 @@
 import logging
 from typing import Any, Dict, List, Optional
 
-from gql import
+from gql import GraphQLRequest
 
 from datahub.api.graphql.base import BaseApi
 
@@ -79,10 +79,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
         if custom_properties is not None:
             variable_values["customProperties"] = custom_properties
 
-
-
+        request = GraphQLRequest(
+            Operation.REPORT_OPERATION_MUTATION, variable_values=variable_values
         )
 
+        result = self.client.execute(request)
+
         return result["reportOperation"]
 
     def query_operations(
@@ -109,12 +111,12 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
         :param partition: The partition to check the operation.
         """
 
-
-
+        request = GraphQLRequest(
+            Operation.QUERY_OPERATIONS,
             variable_values={
                 "urn": urn,
                 "startTimeMillis": start_time_millis,
-                "
+                "endTimeMillis": end_time_millis,
                 "limit": limit,
                 "filter": self.gen_filter(
                     {
@@ -125,6 +127,8 @@ mutation reportOperation($urn: String!, $sourceType: OperationSourceType!, $oper
                 ),
             },
         )
+
+        result = self.client.execute(request)
         if "dataset" in result and "operations" in result["dataset"]:
             operations = []
             if source_type is not None:
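operation.py now builds an explicit GraphQLRequest and hands it to the client, instead of calling execute with a query document and keyword arguments. A minimal sketch of that calling pattern, assuming gql 4.x; the transport, endpoint URL, and query below are placeholders rather than DataHub's actual client setup:

from gql import Client, GraphQLRequest, gql
from gql.transport.requests import RequestsHTTPTransport

transport = RequestsHTTPTransport(url="https://example.com/api/graphql")  # placeholder endpoint
client = Client(transport=transport)

query = gql("query getDataset($urn: String!) { dataset(urn: $urn) { urn } }")  # placeholder query
request = GraphQLRequest(query, variable_values={"urn": "urn:li:dataset:placeholder"})

result = client.execute(request)
print(result)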
datahub/cli/docker_check.py CHANGED

@@ -13,8 +13,8 @@ import yaml
 from datahub.configuration.common import ExceptionWithProps
 
 # Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
-MIN_MEMORY_NEEDED =
-MIN_DISK_SPACE_NEEDED =
+MIN_MEMORY_NEEDED = 4  # GB
+MIN_DISK_SPACE_NEEDED = 13  # GB
 
 DOCKER_COMPOSE_PROJECT_NAME = os.getenv("DATAHUB_COMPOSE_PROJECT_NAME", "datahub")
 DATAHUB_COMPOSE_PROJECT_FILTER = {
datahub/configuration/common.py CHANGED

@@ -1,20 +1,25 @@
+import dataclasses
 import re
 import unittest.mock
 from abc import ABC, abstractmethod
 from enum import auto
 from typing import (
     IO,
+    TYPE_CHECKING,
+    Annotated,
     Any,
     ClassVar,
     Dict,
     List,
     Optional,
     Type,
+    TypeVar,
     Union,
     runtime_checkable,
 )
 
 import pydantic
+import pydantic_core
 from cached_property import cached_property
 from pydantic import BaseModel, Extra, ValidationError
 from pydantic.fields import Field
@@ -83,6 +88,29 @@ def redact_raw_config(obj: Any) -> Any:
     return obj
 
 
+if TYPE_CHECKING:
+    AnyType = TypeVar("AnyType")
+    HiddenFromDocs = Annotated[AnyType, ...]
+else:
+    HiddenFromDocs = pydantic.json_schema.SkipJsonSchema
+
+LaxStr = Annotated[str, pydantic.BeforeValidator(lambda v: str(v))]
+
+
+@dataclasses.dataclass(frozen=True)
+class SupportedSources:
+    sources: List[str]
+
+    def __get_pydantic_json_schema__(
+        self,
+        core_schema: pydantic_core.core_schema.CoreSchema,
+        handler: pydantic.GetJsonSchemaHandler,
+    ) -> pydantic.json_schema.JsonSchemaValue:
+        json_schema = handler(core_schema)
+        json_schema.setdefault("schema_extra", {})["supported_sources"] = self.sources
+        return json_schema
+
+
 class ConfigModel(BaseModel):
     class Config:
         @staticmethod
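The two helpers introduced in common.py are small Pydantic v2 building blocks: LaxStr coerces whatever value it is given to a string before validation, and HiddenFromDocs resolves to pydantic's SkipJsonSchema, so the wrapped field is omitted from the generated JSON schema (and therefore from the config docs). A short sketch of the observable behavior, using an illustrative model name:

from typing import Annotated

import pydantic
from pydantic.json_schema import SkipJsonSchema

# Same definition as in the diff above.
LaxStr = Annotated[str, pydantic.BeforeValidator(lambda v: str(v))]


class DemoConfig(pydantic.BaseModel):
    version: LaxStr = "1"                      # accepts 1, 1.0, or "1" and stores a string
    debug_flag: SkipJsonSchema[bool] = False   # left out of the generated JSON schema


print(DemoConfig(version=2).version)                                 # "2"
print("debug_flag" in DemoConfig.model_json_schema()["properties"])  # False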
@@ -334,4 +362,4 @@ class KeyValuePattern(ConfigModel):
 
 
 class VersionedConfig(ConfigModel):
-    version:
+    version: LaxStr = "1"
datahub/configuration/connection_resolver.py CHANGED

@@ -1,13 +1,16 @@
-from typing import Type
+from typing import TYPE_CHECKING, Type
 
 import pydantic
 
 from datahub.ingestion.api.global_context import get_graph_context
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 
 def auto_connection_resolver(
     connection_field: str = "connection",
-) ->
+) -> "V1RootValidator":
     def _resolve_connection(cls: Type, values: dict) -> dict:
         if connection_field in values:
             connection_urn = values.pop(connection_field)
datahub/configuration/import_resolver.py CHANGED

@@ -1,15 +1,18 @@
-from typing import TypeVar, Union
+from typing import TYPE_CHECKING, Type, TypeVar, Union
 
 import pydantic
 
 from datahub.ingestion.api.registry import import_path
 
-
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1Validator
 
+_T = TypeVar("_T")
 
-
+
+def _pydantic_resolver(cls: Type, v: Union[str, _T]) -> _T:
     return import_path(v) if isinstance(v, str) else v
 
 
-def pydantic_resolve_key(field: str) ->
+def pydantic_resolve_key(field: str) -> "V1Validator":
     return pydantic.validator(field, pre=True, allow_reuse=True)(_pydantic_resolver)
datahub/configuration/pydantic_migration_helpers.py CHANGED

@@ -9,14 +9,6 @@ PYDANTIC_VERSION_2 = _pydantic_version >= Version("2.0")
 # https://docs.pydantic.dev/latest/changelog/#v250-2023-11-13
 PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR = _pydantic_version >= Version("2.5.0")
 
-# This can be used to silence deprecation warnings while we migrate.
-if PYDANTIC_VERSION_2:
-    from pydantic import PydanticDeprecatedSince20  # type: ignore
-else:
-
-    class PydanticDeprecatedSince20(Warning):  # type: ignore
-        pass
-
 
 if PYDANTIC_VERSION_2:
     from pydantic import BaseModel as GenericModel

@@ -52,7 +44,6 @@ class v1_ConfigModel(v1_BaseModel):
 __all__ = [
     "PYDANTIC_VERSION_2",
     "PYDANTIC_SUPPORTS_CALLABLE_DISCRIMINATOR",
-    "PydanticDeprecatedSince20",
     "GenericModel",
     "v1_ConfigModel",
     "v1_Field",
datahub/configuration/source_common.py CHANGED

@@ -1,6 +1,6 @@
 from typing import Dict, Optional
 
-
+import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import ConfigModel

@@ -30,7 +30,8 @@ class EnvConfigMixin(ConfigModel):
         description="The environment that all assets produced by this connector belong to",
     )
 
-    @
+    @pydantic.field_validator("env", mode="after")
+    @classmethod
     def env_must_be_one_of(cls, v: str) -> str:
         if v.upper() not in ALL_ENV_TYPES:
             raise ValueError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
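The source_common.py hunk swaps the old validator decorator for Pydantic v2's field_validator plus classmethod. A self-contained sketch of that v2 pattern; the model and the ALL_ENV_TYPES stub below are illustrative:

import pydantic

ALL_ENV_TYPES = {"PROD", "DEV", "QA", "TEST"}  # stand-in for the real constant


class EnvDemo(pydantic.BaseModel):
    env: str = "PROD"

    @pydantic.field_validator("env", mode="after")
    @classmethod
    def env_must_be_one_of(cls, v: str) -> str:
        if v.upper() not in ALL_ENV_TYPES:
            raise ValueError(f"env must be one of {ALL_ENV_TYPES}, found {v}")
        return v


print(EnvDemo(env="dev").env)  # "dev" -- accepted because the check is case-insensitive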
datahub/configuration/validate_field_deprecation.py CHANGED

@@ -1,11 +1,14 @@
 import warnings
-from typing import Any, Optional, Type
+from typing import TYPE_CHECKING, Any, Optional, Type
 
 import pydantic
 
 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 _unset = object()
 
 

@@ -13,7 +16,7 @@ def pydantic_field_deprecated(
     field: str,
     warn_if_value_is_not: Any = _unset,
     message: Optional[str] = None,
-) ->
+) -> "V1RootValidator":
     if message:
         output = message
     else:
datahub/configuration/validate_field_removal.py CHANGED

@@ -1,15 +1,18 @@
 import warnings
-from typing import Type
+from typing import TYPE_CHECKING, Type
 
 import pydantic
 
 from datahub.configuration.common import ConfigurationWarning
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 
 def pydantic_removed_field(
     field: str,
     print_warning: bool = True,
-) ->
+) -> "V1RootValidator":
     def _validate_field_removal(cls: Type, values: dict) -> dict:
         if field in values:
             if print_warning:
datahub/configuration/validate_field_rename.py CHANGED

@@ -1,11 +1,14 @@
 import warnings
-from typing import Callable, Type, TypeVar
+from typing import TYPE_CHECKING, Callable, Type, TypeVar
 
 import pydantic
 
 from datahub.configuration.common import ConfigurationWarning
 from datahub.utilities.global_warning_util import add_global_warning
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 _T = TypeVar("_T")
 
 

@@ -18,7 +21,7 @@ def pydantic_renamed_field(
     new_name: str,
     transform: Callable = _default_rename_transform,
     print_warning: bool = True,
-) ->
+) -> "V1RootValidator":
     def _validate_field_rename(cls: Type, values: dict) -> dict:
         if old_name in values:
             if new_name in values:

@@ -49,6 +52,4 @@ def pydantic_renamed_field(
     # validator with pre=True gets all the values that were passed in.
     # Given that a renamed field doesn't show up in the fields list, we can't use
     # the field-level validator, even with a different field name.
-    return pydantic.root_validator(pre=True,
-        _validate_field_rename
-    )
+    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
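These helpers return Pydantic v1-style root validators that are attached to config models as class-level assignments, the same pattern as the _platform_instance_not_supported_for_bigquery = pydantic_removed_field(...) line later in this diff. A hedged usage sketch, assuming the helper's first two arguments are the old and new field names; the config class and field names are made up for illustration:

from datahub.configuration.common import ConfigModel
from datahub.configuration.validate_field_rename import pydantic_renamed_field


class ExampleSourceConfig(ConfigModel):
    table_pattern: str = ".*"

    # Accept the old key "table_regex" in user-supplied configs and move its
    # value to "table_pattern" (emitting a warning by default).
    _rename_table_regex = pydantic_renamed_field("table_regex", "table_pattern")


cfg = ExampleSourceConfig.parse_obj({"table_regex": "prod\\..*"})
print(cfg.table_pattern)  # prod\..*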
datahub/configuration/validate_multiline_string.py CHANGED

@@ -1,9 +1,12 @@
-from typing import Optional, Type, Union
+from typing import TYPE_CHECKING, Optional, Type, Union
 
 import pydantic
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1Validator
 
-
+
+def pydantic_multiline_string(field: str) -> "V1Validator":
     """If the field is present and contains an escaped newline, replace it with a real newline.
 
     This makes the assumption that the field value is never supposed to have a
datahub/ingestion/run/pipeline_config.py CHANGED

@@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional
 
 from pydantic import Field, validator
 
-from datahub.configuration.common import ConfigModel, DynamicTypedConfig
+from datahub.configuration.common import ConfigModel, DynamicTypedConfig, HiddenFromDocs
 from datahub.ingestion.graph.config import DatahubClientConfig
 from datahub.ingestion.sink.file import FileSinkConfig
 

@@ -85,7 +85,7 @@ class PipelineConfig(ConfigModel):
     source: SourceConfig
     sink: Optional[DynamicTypedConfig] = None
     transformers: Optional[List[DynamicTypedConfig]] = None
-    flags: FlagsConfig =
+    flags: HiddenFromDocs[FlagsConfig] = FlagsConfig()
     reporting: List[ReporterConfig] = []
     run_id: str = DEFAULT_RUN_ID
     datahub_api: Optional[DatahubClientConfig] = None
datahub/ingestion/source/azure/azure_common.py CHANGED

@@ -81,7 +81,7 @@ class AzureConnectionConfig(ConfigModel):
     )
         return self.sas_token if self.sas_token is not None else self.account_key
 
-    @root_validator()
+    @root_validator(skip_on_failure=True)
     def _check_credential_values(cls, values: Dict) -> Dict:
         if (
             values.get("account_key")
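Context for the one-line azure_common.py change: under Pydantic v2's compatibility shim, a post-validation @root_validator (pre=False, the default) must pass skip_on_failure=True, otherwise the class definition itself raises an error. A minimal sketch of the accepted form, with an illustrative model:

from typing import Dict

from pydantic import BaseModel, root_validator


class AzureCredsDemo(BaseModel):
    account_key: str = ""
    sas_token: str = ""

    # skip_on_failure=True: only run this check if the individual fields validated.
    @root_validator(skip_on_failure=True)
    def _check_credential_values(cls, values: Dict) -> Dict:
        if not values.get("account_key") and not values.get("sas_token"):
            raise ValueError("either account_key or sas_token must be set")
        return values


print(AzureCredsDemo(account_key="abc123").sas_token)  # ""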
datahub/ingestion/source/bigquery_v2/bigquery_config.py CHANGED

@@ -1,12 +1,13 @@
 import logging
 import os
 import re
+from copy import deepcopy
 from datetime import timedelta
 from typing import Dict, List, Optional, Union
 
 from pydantic import Field, PositiveInt, PrivateAttr, root_validator, validator
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     LowerCaseDatasetUrnConfigMixin,

@@ -73,8 +74,10 @@ class BigQueryBaseConfig(ConfigModel):
         ) from e
         return v
 
-    @root_validator(pre=True
+    @root_validator(pre=True)
     def project_id_backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         project_id = values.pop("project_id", None)
         project_ids = values.get("project_ids")
 
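Several BigQuery validators in this release now start by deep-copying values. Per the inline comment, a pre root validator can receive the caller's raw config mapping, so popping or rewriting keys in place may leak mutations back into a dictionary that is reused elsewhere (for example across test cases). A standalone sketch of the pattern; the model and field names are illustrative:

from copy import deepcopy
from typing import Dict, List

from pydantic import BaseModel, root_validator


class ProjectDemoConfig(BaseModel):
    project_ids: List[str] = []

    @root_validator(pre=True)
    def project_id_backward_compat(cls, values: Dict) -> Dict:
        # Work on a copy so the mapping supplied by the caller is left untouched.
        values = deepcopy(values)
        project_id = values.pop("project_id", None)
        if project_id and not values.get("project_ids"):
            values["project_ids"] = [project_id]
        return values


print(ProjectDemoConfig(project_id="my-gcp-project").project_ids)  # ['my-gcp-project']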
@@ -182,13 +185,14 @@ class BigQueryFilterConfig(SQLFilterConfig):
     )
 
     # NOTE: `schema_pattern` is added here only to hide it from docs.
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
     )
 
     @root_validator(pre=False, skip_on_failure=True)
     def backward_compatibility_configs_set(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern")
         schema_pattern = values.get("schema_pattern")
         if (
@@ -320,8 +324,7 @@ class BigQueryV2Config(
         description="Include full payload into events. It is only for debugging and internal use.",
     )
 
-    number_of_datasets_process_in_batch: int = Field(
-        hidden_from_docs=True,
+    number_of_datasets_process_in_batch: HiddenFromDocs[int] = Field(
         default=10000,
         description="Number of table queried in batch when getting metadata. This is a low level config property "
         "which should be touched with care.",
@@ -436,17 +439,15 @@ class BigQueryV2Config(
 
     upstream_lineage_in_report: bool = Field(
         default=False,
-        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.",
+        description="Useful for debugging lineage information. Set to True to see the raw lineage created internally. Only works with legacy approach (`use_queries_v2: False`).",
     )
 
-    run_optimized_column_query: bool = Field(
-        hidden_from_docs=True,
+    run_optimized_column_query: HiddenFromDocs[bool] = Field(
         default=False,
         description="Run optimized column query to get column information. This is an experimental feature and may not work for all cases.",
     )
 
-    file_backed_cache_size: int = Field(
-        hidden_from_docs=True,
+    file_backed_cache_size: HiddenFromDocs[int] = Field(
         default=2000,
         description="Maximum number of entries for the in-memory caches of FileBacked data structures.",
     )
@@ -456,10 +457,9 @@ class BigQueryV2Config(
         description="Option to exclude empty projects from being ingested.",
     )
 
-    schema_resolution_batch_size: int = Field(
+    schema_resolution_batch_size: HiddenFromDocs[int] = Field(
         default=100,
         description="The number of tables to process in a batch when resolving schema from DataHub.",
-        hidden_from_schema=True,
     )
 
     max_threads_dataset_parallelism: int = Field(
@@ -480,6 +480,8 @@ class BigQueryV2Config(
 
     @root_validator(pre=True)
     def set_include_schema_metadata(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Historically this is used to disable schema ingestion
         if (
             "include_tables" in values
@@ -498,6 +500,8 @@ class BigQueryV2Config(
 
     @root_validator(skip_on_failure=True)
     def profile_default_settings(cls, values: Dict) -> Dict:
+        # Create a copy to avoid modifying the input dictionary, preventing state contamination in tests
+        values = deepcopy(values)
         # Extra default SQLAlchemy option for better connection pooling and threading.
         # https://docs.sqlalchemy.org/en/14/core/pooling.html#sqlalchemy.pool.QueuePool.params.max_overflow
         values["options"].setdefault("max_overflow", -1)
@@ -515,9 +519,19 @@ class BigQueryV2Config(
 
         return v
 
+    @validator("upstream_lineage_in_report")
+    def validate_upstream_lineage_in_report(cls, v: bool, values: Dict) -> bool:
+        if v and values.get("use_queries_v2", True):
+            logging.warning(
+                "`upstream_lineage_in_report` is enabled but will be ignored because `use_queries_v2` is enabled."
+                "This debugging feature only works with the legacy lineage approach (`use_queries_v2: false`)."
+            )
+
+        return v
+
     def get_table_pattern(self, pattern: List[str]) -> str:
         return "|".join(pattern) if pattern else ""
 
-
+    _platform_instance_not_supported_for_bigquery = pydantic_removed_field(
         "platform_instance"
     )
datahub/ingestion/source/bigquery_v2/queries_extractor.py CHANGED

@@ -8,7 +8,7 @@ from typing import Collection, Dict, Iterable, List, Optional, TypedDict
 from google.cloud.bigquery import Client
 from pydantic import Field, PositiveInt
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     get_time_bucket,
@@ -86,12 +86,11 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig):
     # TODO: Support stateful ingestion for the time windows.
     window: BaseTimeWindowConfig = BaseTimeWindowConfig()
 
-    local_temp_path: Optional[pathlib.Path] = Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = Field(
         # TODO: For now, this is simply an advanced config to make local testing easier.
         # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-
+        default=None,
+        description="Local path to store the audit log.",
     )
 
     user_email_pattern: AllowDenyPattern = Field(
datahub/ingestion/source/common/gcp_credentials_config.py CHANGED

@@ -9,7 +9,9 @@ from datahub.configuration.validate_multiline_string import pydantic_multiline_s
 
 
 class GCPCredential(ConfigModel):
-    project_id: Optional[str] = Field(
+    project_id: Optional[str] = Field(
+        None, description="Project id to set the credentials"
+    )
     private_key_id: str = Field(description="Private key id")
     private_key: str = Field(
         description="Private key in a form of '-----BEGIN PRIVATE KEY-----\\nprivate-key\\n-----END PRIVATE KEY-----\\n'"