acryl-datahub 1.2.0.9rc1__py3-none-any.whl → 1.2.0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/METADATA +2568 -2626
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/RECORD +120 -113
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/autogenerated/capability_summary.json +45 -1
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +23 -16
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -0
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/dbt/dbt_common.py +65 -5
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +19 -2
- datahub/ingestion/source/ge_data_profiler.py +15 -2
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +76 -75
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_source.py +493 -547
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/looker/lookml_source.py +46 -88
- datahub/ingestion/source/metabase.py +9 -2
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redash.py +1 -1
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/redshift/query.py +23 -19
- datahub/ingestion/source/s3/source.py +26 -24
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snaplogic/__init__.py +0 -0
- datahub/ingestion/source/snaplogic/snaplogic.py +355 -0
- datahub/ingestion/source/snaplogic/snaplogic_config.py +37 -0
- datahub/ingestion/source/snaplogic/snaplogic_lineage_extractor.py +107 -0
- datahub/ingestion/source/snaplogic/snaplogic_parser.py +168 -0
- datahub/ingestion/source/snaplogic/snaplogic_utils.py +31 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +122 -15
- datahub/ingestion/source/tableau/tableau.py +68 -14
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/set_browse_path.py +112 -0
- datahub/metadata/_internal_schema_classes.py +728 -528
- datahub/metadata/_urns/urn_defs.py +1702 -1702
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/schema.avsc +17434 -17732
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +72 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/LogicalParent.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/_shared.py +126 -0
- datahub/sdk/chart.py +87 -30
- datahub/sdk/dashboard.py +79 -34
- datahub/sdk/entity_client.py +11 -4
- datahub/sdk/lineage_client.py +3 -3
- datahub/sdk/search_filters.py +1 -7
- datahub/sql_parsing/split_statements.py +13 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.9rc1.dist-info → acryl_datahub-1.2.0.10.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/snowflake/snowflake_config.py

```diff
@@ -7,7 +7,7 @@ from typing import Dict, List, Optional, Set
 import pydantic
 from pydantic import Field, root_validator, validator
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.configuration.source_common import (
     EnvConfigMixin,
@@ -67,13 +67,10 @@ class TagOption(StrEnum):
 
 @dataclass(frozen=True)
 class DatabaseId:
-    database: str = Field(
-        description="Database created from share in consumer account."
-    )
-    platform_instance: Optional[str] = Field(
-        default=None,
-        description="Platform instance of consumer snowflake account.",
-    )
+    # Database created from share in consumer account
+    database: str
+    # Platform instance of consumer snowflake account
+    platform_instance: Optional[str] = None
 
 
 class SnowflakeShareConfig(ConfigModel):
@@ -282,10 +279,11 @@ class SnowflakeV2Config(
         description="If enabled along with `extract_tags`, extracts snowflake's key-value tags as DataHub structured properties instead of DataHub tags.",
     )
 
-    structured_properties_template_cache_invalidation_interval: int = Field(
-        hidden_from_docs=True,
-        default=60,
-        description="Interval in seconds to invalidate the structured properties template cache.",
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )
 
     include_external_url: bool = Field(
@@ -334,7 +332,7 @@ class SnowflakeV2Config(
         "to ignore the temporary staging tables created by known ETL tools.",
     )
 
-    rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(
+    rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field(  # type: ignore[pydantic-field]
         "upstreams_deny_pattern", "temporary_tables_pattern"
     )
 
@@ -352,8 +350,7 @@ class SnowflakeV2Config(
     )
 
     # Allows empty containers to be ingested before datasets are added, avoiding permission errors
-    warn_no_datasets: bool = Field(
-        hidden_from_docs=True,
+    warn_no_datasets: HiddenFromDocs[bool] = Field(
         default=False,
         description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
     )
```
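Note on the recurring pattern in this release: the `hidden_from_docs=True` kwarg (a custom `Field` extra, which pydantic v2 deprecates) is replaced by a `HiddenFromDocs[...]` wrapper imported from `datahub.configuration.common`. That class's definition is not part of this diff; below is a minimal sketch of how such a marker can be built on `typing.Annotated` under pydantic v2 — names and details are illustrative, not DataHub's actual implementation.

```python
from typing import Annotated, Any

import pydantic

_HIDDEN = object()  # sentinel carried in the Annotated metadata


class HiddenFromDocs:
    """Illustrative marker: HiddenFromDocs[X] == Annotated[X, _HIDDEN]."""

    def __class_getitem__(cls, item: Any) -> Any:
        return Annotated[item, _HIDDEN]


class ExampleConfig(pydantic.BaseModel):
    scheme: HiddenFromDocs[str] = "snowflake"  # still validated as a plain str
    host_port: str = "localhost:443"


def is_hidden(model: type, field_name: str) -> bool:
    # A docs generator can inspect the annotation metadata instead of
    # relying on a non-standard Field kwarg.
    return any(m is _HIDDEN for m in model.model_fields[field_name].metadata)


assert is_hidden(ExampleConfig, "scheme")
assert not is_hidden(ExampleConfig, "host_port")
```

Moving the flag into the type annotation keeps the field's runtime type unchanged while making the docs-visibility decision inspectable from the model class itself.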
@@ -15,7 +15,12 @@ from snowflake.connector.network import (
|
|
|
15
15
|
OAUTH_AUTHENTICATOR,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
from datahub.configuration.common import
|
|
18
|
+
from datahub.configuration.common import (
|
|
19
|
+
ConfigModel,
|
|
20
|
+
ConfigurationError,
|
|
21
|
+
HiddenFromDocs,
|
|
22
|
+
MetaError,
|
|
23
|
+
)
|
|
19
24
|
from datahub.configuration.connection_resolver import auto_connection_resolver
|
|
20
25
|
from datahub.configuration.validate_field_rename import pydantic_renamed_field
|
|
21
26
|
from datahub.ingestion.api.closeable import Closeable
|
|
@@ -63,7 +68,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
63
68
|
description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.",
|
|
64
69
|
)
|
|
65
70
|
|
|
66
|
-
scheme: str = "snowflake"
|
|
71
|
+
scheme: HiddenFromDocs[str] = "snowflake"
|
|
67
72
|
username: Optional[str] = pydantic.Field(
|
|
68
73
|
default=None, description="Snowflake username."
|
|
69
74
|
)
|
|
@@ -118,7 +123,7 @@ class SnowflakeConnectionConfig(ConfigModel):
|
|
|
118
123
|
assert self.account_id
|
|
119
124
|
return self.account_id
|
|
120
125
|
|
|
121
|
-
rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id")
|
|
126
|
+
rename_host_port_to_account_id = pydantic_renamed_field("host_port", "account_id") # type: ignore[pydantic-field]
|
|
122
127
|
|
|
123
128
|
@pydantic.validator("account_id")
|
|
124
129
|
def validate_account_id(cls, account_id: str, values: Dict) -> str:
|
|
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py

```diff
@@ -2,7 +2,17 @@ import json
 import logging
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Collection, Iterable, List, Optional, Set, Tuple, Type
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Collection,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+)
 
 from pydantic import BaseModel, Field, validator
 
@@ -44,6 +54,9 @@ from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1Validator
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 EXTERNAL_LINEAGE = "external_lineage"
@@ -51,7 +64,7 @@ TABLE_LINEAGE = "table_lineage"
 VIEW_LINEAGE = "view_lineage"
 
 
-def pydantic_parse_json(field: str) -> classmethod:
+def pydantic_parse_json(field: str) -> "V1Validator":
    def _parse_from_json(cls: Type, v: Any) -> dict:
         if isinstance(v, str):
             return json.loads(v)
```
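The inner `_parse_from_json` shown in the hunk is unchanged; only the helper's return annotation moves to the typing-only `V1Validator` name. A self-contained sketch of the whole helper under that convention — the wrapper around the diff's `_parse_from_json` is reconstructed, not verbatim:

```python
import json
from typing import TYPE_CHECKING, Any, Type

import pydantic

if TYPE_CHECKING:
    # Typing-only import: pydantic.deprecated warns at runtime, so the
    # name is used only inside a quoted annotation.
    from pydantic.deprecated.class_validators import V1Validator


def pydantic_parse_json(field: str) -> "V1Validator":
    """Allow a dict-valued field to also be passed as a JSON string."""

    def _parse_from_json(cls: Type, v: Any) -> Any:
        if isinstance(v, str):
            return json.loads(v)
        return v

    return pydantic.validator(field, pre=True, allow_reuse=True)(_parse_from_json)


class QueryEntry(pydantic.BaseModel):
    extra: dict

    parse_extra_json = pydantic_parse_json("extra")


print(QueryEntry.parse_obj({"extra": '{"a": 1}'}).extra)  # {'a': 1}
```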
datahub/ingestion/source/snowflake/snowflake_queries.py

```diff
@@ -13,7 +13,7 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 import pydantic
 from typing_extensions import Self
 
-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.time_window_config import (
     BaseTimeWindowConfig,
     BucketDuration,
@@ -112,12 +112,11 @@ class SnowflakeQueriesExtractorConfig(ConfigModel):
         "to ignore the temporary staging tables created by known ETL tools.",
     )
 
-    local_temp_path: Optional[pathlib.Path] = pydantic.Field(
-        default=None,
-        description="Local path to store the audit log.",
+    local_temp_path: HiddenFromDocs[Optional[pathlib.Path]] = pydantic.Field(
         # TODO: For now, this is simply an advanced config to make local testing easier.
         # Eventually, we will want to store date-specific files in the directory and use it as a cache.
-        hidden_from_docs=True,
+        default=None,
+        description="Local path to store the audit log.",
     )
 
     include_lineage: bool = True
```
datahub/ingestion/source/sql/athena.py

```diff
@@ -16,6 +16,7 @@ from sqlalchemy.engine.reflection import Inspector
 from sqlalchemy.types import TypeEngine
 from sqlalchemy_bigquery import STRUCT
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey
 from datahub.ingestion.api.decorators import (
@@ -251,7 +252,7 @@ class CustomAthenaRestDialect(AthenaRestDialect):
 
 
 class AthenaConfig(SQLCommonConfig):
-    scheme: str = "awsathena+rest"
+    scheme: HiddenFromDocs[str] = "awsathena+rest"
     username: Optional[str] = pydantic.Field(
         default=None,
         description="Username credential. If not specified, detected with boto3 rules. See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html",
```
datahub/ingestion/source/sql/clickhouse.py

```diff
@@ -18,6 +18,7 @@ from sqlalchemy.sql import sqltypes
 from sqlalchemy.types import BOOLEAN, DATE, DATETIME, INTEGER
 
 import datahub.emitter.mce_builder as builder
+from datahub.configuration.common import HiddenFromDocs, LaxStr
 from datahub.configuration.source_common import DatasetLineageProviderConfigBase
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
@@ -128,16 +129,20 @@ class ClickHouseConfig(
 ):
     # defaults
     host_port: str = Field(default="localhost:8123", description="ClickHouse host URL.")
-    scheme: str = Field(default="clickhouse", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="clickhouse")
     password: pydantic.SecretStr = Field(
         default=pydantic.SecretStr(""), description="password"
     )
-    secure: Optional[bool] = Field(default=None, description="")
-    protocol: Optional[str] = Field(default=None, description="")
+    secure: Optional[bool] = Field(
+        default=None, description="[deprecated] Use uri_opts instead."
+    )
+    protocol: Optional[str] = Field(
+        default=None, description="[deprecated] Use uri_opts instead."
+    )
     _deprecate_secure = pydantic_field_deprecated("secure")
     _deprecate_protocol = pydantic_field_deprecated("protocol")
 
-    uri_opts: Dict[str, str] = Field(
+    uri_opts: Dict[str, LaxStr] = Field(
         default={},
         description="The part of the URI and it's used to provide additional configuration options or parameters for the database connection.",
     )
@@ -185,9 +190,9 @@ class ClickHouseConfig(
                 "Initializing uri_opts from deprecated secure or protocol options"
             )
             values["uri_opts"] = {}
-            if secure:
-                values["uri_opts"]["secure"] = secure
-            if protocol:
+            if secure is not None:
+                values["uri_opts"]["secure"] = str(secure)
+            if protocol is not None:
                 values["uri_opts"]["protocol"] = protocol
             logger.debug(f"uri_opts: {uri_opts}")
         elif (secure or protocol) and uri_opts:
```
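The validator fix is subtle: `if secure:` silently dropped an explicit `secure=False`, and passing the raw bool leaked a non-string into the URI query options. A small illustration of why the stringified value matters once the options end up in a SQLAlchemy URL — illustrative, not the source's exact code path:

```python
from sqlalchemy.engine import URL

# Previously `if secure:` dropped secure=False, so an explicit
# "don't use TLS" setting never reached the connection URL.
secure = False
uri_opts = {}
if secure is not None:
    uri_opts["secure"] = str(secure)  # URL query values must be strings

url = URL.create(
    "clickhouse",
    host="localhost",
    port=8123,
    database="default",
    query=uri_opts,
)
print(url)  # clickhouse://localhost:8123/default?secure=False
```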
datahub/ingestion/source/sql/cockroachdb.py

```diff
@@ -1,6 +1,6 @@
 from pydantic.fields import Field
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -14,8 +14,10 @@ from datahub.ingestion.source.sql.postgres import PostgresConfig, PostgresSource
 
 
 class CockroachDBConfig(PostgresConfig):
-    scheme = Field(default="cockroachdb+psycopg2", description="database scheme")
-
+    scheme: HiddenFromDocs[str] = Field(
+        default="cockroachdb+psycopg2", description="database scheme"
+    )
+
     schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["information_schema", "crdb_internal"])
     )
```
datahub/ingestion/source/sql/druid.py

```diff
@@ -6,7 +6,7 @@ from pydantic.fields import Field
 from pydruid.db.sqlalchemy import DruidDialect
 from sqlalchemy.exc import ResourceClosedError
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -34,7 +34,7 @@ DruidDialect.get_table_names = get_table_names
 
 class DruidConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = "druid"
+    scheme: HiddenFromDocs[str] = "druid"
     schema_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=["^(lookup|sysgit|view).*"]),
         description="regex patterns for schemas to filter in ingestion.",
```
datahub/ingestion/source/sql/hive.py

```diff
@@ -6,7 +6,7 @@ from enum import Enum
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 from urllib.parse import urlparse
 
-from pydantic.class_validators import validator
+from pydantic import validator
 from pydantic.fields import Field
 
 # This import verifies that the dependencies are available.
@@ -14,6 +14,7 @@ from pyhive import hive  # noqa: F401
 from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp
 from sqlalchemy.engine.reflection import Inspector
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.emitter.mce_builder import (
     make_data_platform_urn,
     make_dataplatform_instance_urn,
@@ -651,10 +652,10 @@ HiveDialect.get_view_definition = get_view_definition_patched
 
 class HiveConfig(TwoTierSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="hive", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="hive")
 
     # Overriding as table location lineage is richer implementation here than with include_table_location_lineage
-    include_table_location_lineage: bool = Field(default=False, hidden_from_docs=True)
+    include_table_location_lineage: HiddenFromDocs[bool] = Field(default=False)
 
     emit_storage_lineage: bool = Field(
         default=False,
```
datahub/ingestion/source/sql/hive_metastore.py

```diff
@@ -1,17 +1,15 @@
 import base64
+import dataclasses
 import json
 import logging
 from collections import namedtuple
 from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
-from pydantic.dataclasses import dataclass
-from pydantic.fields import Field
-
-# This import verifies that the dependencies are available.
+from pydantic import Field
 from sqlalchemy import create_engine, text
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -73,7 +71,7 @@ class HiveMetastoreConfigMode(StrEnum):
     trino = "trino"
 
 
-@dataclass
+@dataclasses.dataclass
 class ViewDataset:
     dataset_name: str
     schema_name: str
```
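Swapping `pydantic.dataclasses.dataclass` for the stdlib decorator removes per-instance validation from `ViewDataset`, an internal record type, and drops a pydantic dependency from this path. The behavioral difference, for reference (assuming pydantic v2's default lax coercion):

```python
import dataclasses

import pydantic.dataclasses


@dataclasses.dataclass
class PlainDataset:
    dataset_name: str
    columns: int


@pydantic.dataclasses.dataclass
class ValidatedDataset:
    dataset_name: str
    columns: int


# The stdlib dataclass stores whatever it is given...
plain = PlainDataset(dataset_name="db.view1", columns="3")
print(plain.columns)  # '3' (still a string, no coercion)

# ...while the pydantic dataclass validates/coerces on construction.
validated = ValidatedDataset(dataset_name="db.view1", columns="3")
print(validated.columns)  # 3 (coerced to int)
```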
datahub/ingestion/source/sql/hive_metastore.py (continued)

```diff
@@ -99,7 +97,7 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         default="localhost:3306",
         description="Host URL and port to connect to. Example: localhost:3306",
     )
-    scheme: str = Field(default="mysql+pymysql", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="mysql+pymysql")
 
     database_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
@@ -123,8 +121,8 @@ class HiveMetastore(BasicSQLAlchemyConfig):
         description="Dataset Subtype name to be 'Table' or 'View' Valid options: ['True', 'False']",
     )
 
-    include_view_lineage: bool = Field(
-        default=False,
+    include_view_lineage: HiddenFromDocs[bool] = Field(
+        default=False,
     )
 
     include_catalog_name_in_ids: bool = Field(
```
datahub/ingestion/source/sql/mssql/source.py

```diff
@@ -13,7 +13,7 @@ from sqlalchemy.exc import ProgrammingError, ResourceClosedError
 from sqlalchemy.sql import quoted_name
 
 import datahub.metadata.schema_classes as models
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.pattern_utils import UUID_REGEX
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
@@ -75,7 +75,7 @@ DEFAULT_TEMP_TABLES_PATTERNS = [
 class SQLServerConfig(BasicSQLAlchemyConfig):
     # defaults
     host_port: str = Field(default="localhost:1433", description="MSSQL host URL.")
-    scheme: str = Field(default="mssql+pytds", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="mssql+pytds")
 
     # TODO: rename to include_procedures ?
     include_stored_procedures: bool = Field(
```
datahub/ingestion/source/sql/mysql.py

```diff
@@ -9,7 +9,7 @@ from sqlalchemy.dialects.mysql import BIT, base
 from sqlalchemy.dialects.mysql.enumerated import SET
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.ingestion.api.decorators import (
     SourceCapability,
     SupportStatus,
@@ -57,7 +57,7 @@ base.ischema_names["decimal128"] = DECIMAL128
 class MySQLConnectionConfig(SQLAlchemyConnectionConfig):
     # defaults
     host_port: str = Field(default="localhost:3306", description="MySQL host URL.")
-    scheme: str = "mysql+pymysql"
+    scheme: HiddenFromDocs[str] = "mysql+pymysql"
 
 
 class MySQLConfig(MySQLConnectionConfig, TwoTierSQLAlchemyConfig):
```
datahub/ingestion/source/sql/oracle.py

```diff
@@ -110,10 +110,10 @@ class OracleConfig(BasicSQLAlchemyConfig):
         return v
 
     @pydantic.validator("data_dictionary_mode")
-    def check_data_dictionary_mode(cls, values):
-        if values not in ("ALL", "DBA"):
+    def check_data_dictionary_mode(cls, value):
+        if value not in ("ALL", "DBA"):
             raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
-        return values
+        return value
 
     @pydantic.validator("thick_mode_lib_dir", always=True)
     def check_thick_mode_lib_dir(cls, v, values):
```
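The rename to `value` matters for readability: in pydantic validators, `values` conventionally names the dict of previously-validated fields that a validator may request as an extra parameter, so reusing it for a single field value was misleading. A sketch contrasting the two signatures; the `service_name` field and its rule are hypothetical, added only for illustration:

```python
import pydantic


class OracleLikeConfig(pydantic.BaseModel):
    data_dictionary_mode: str = "ALL"
    service_name: str = ""

    @pydantic.validator("data_dictionary_mode")
    def check_data_dictionary_mode(cls, value):
        # `value` is the single field being validated.
        if value not in ("ALL", "DBA"):
            raise ValueError("Specify one of data dictionary views mode: 'ALL', 'DBA'.")
        return value

    @pydantic.validator("service_name")
    def check_service_name(cls, value, values):
        # A second `values` parameter, when declared, receives the
        # previously-validated fields - which is why reusing that name
        # for the field value itself was confusing.
        if value and values.get("data_dictionary_mode") == "DBA":
            return value.upper()
        return value
```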
datahub/ingestion/source/sql/presto.py

```diff
@@ -8,6 +8,7 @@ from sqlalchemy import exc, sql
 from sqlalchemy.engine import reflection
 from sqlalchemy.engine.base import Engine
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SourceCapability,
@@ -87,7 +88,7 @@ PrestoDialect._get_full_table = _get_full_table
 
 class PrestoConfig(TrinoConfig):
     # defaults
-    scheme: str = Field(default="presto", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="presto")
 
 
 @platform_name("Presto", doc_order=1)
```
datahub/ingestion/source/sql/teradata.py

```diff
@@ -468,23 +468,23 @@ class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig):
         ),
     )
 
-    database_pattern = Field(
+    database_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern(deny=EXCLUDED_DATABASES),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    include_table_lineage = Field(
+    include_table_lineage: bool = Field(
         default=False,
         description="Whether to include table lineage in the ingestion. "
         "This requires to have the table lineage feature enabled.",
     )
-    include_view_lineage = Field(
+    include_view_lineage: bool = Field(
         default=True,
         description="Whether to include view lineage in the ingestion. "
         "This requires to have the view lineage feature enabled.",
     )
-    include_queries = Field(
+    include_queries: bool = Field(
         default=True,
         description="Whether to generate query entities for SQL queries. "
         "Query entities provide metadata about individual SQL queries including "
```
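These annotations are a pydantic v2 requirement: v1 let an un-annotated `Field(...)` assignment slide (inheriting the annotation where the field overrode a parent config), while v2 refuses to build the model at all. A minimal reproduction, assuming pydantic v2:

```python
import pydantic
from pydantic import Field

try:

    class BrokenConfig(pydantic.BaseModel):
        # No annotation: pydantic v2 rejects this at class-creation time.
        include_table_lineage = Field(default=False)

except pydantic.errors.PydanticUserError as exc:
    print(exc.code)  # model-field-missing-annotation


class FixedConfig(pydantic.BaseModel):
    include_table_lineage: bool = Field(default=False)


print(FixedConfig().include_table_lineage)  # False
```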
datahub/ingestion/source/sql/trino.py

```diff
@@ -18,6 +18,7 @@ from sqlalchemy.types import TypeEngine
 from trino.sqlalchemy import datatype
 from trino.sqlalchemy.dialect import TrinoDialect
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -222,7 +223,7 @@ class ConnectorDetail(PlatformInstanceConfigMixin, EnvConfigMixin):
 
 class TrinoConfig(BasicSQLAlchemyConfig):
     # defaults
-    scheme: str = Field(default="trino", hidden_from_docs=True)
+    scheme: HiddenFromDocs[str] = Field(default="trino")
     database: str = Field(description="database (catalog)")
 
     catalog_to_connector_details: Dict[str, ConnectorDetail] = Field(
```
datahub/ingestion/source/sql/two_tier_sql_source.py

```diff
@@ -7,7 +7,7 @@ from sqlalchemy import create_engine, inspect
 from sqlalchemy.engine import URL
 from sqlalchemy.engine.reflection import Inspector
 
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.emitter.mcp_builder import ContainerKey
 from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -27,11 +27,10 @@ class TwoTierSQLAlchemyConfig(BasicSQLAlchemyConfig):
         default=AllowDenyPattern.allow_all(),
         description="Regex patterns for databases to filter in ingestion.",
     )
-    schema_pattern: AllowDenyPattern = Field(
+    schema_pattern: HiddenFromDocs[AllowDenyPattern] = Field(
         # The superclass contains a `schema_pattern` field, so we need this here
         # to override the documentation.
         default=AllowDenyPattern.allow_all(),
-        hidden_from_docs=True,
         description="Deprecated in favour of database_pattern.",
     )
 
```
datahub/ingestion/source/sql/vertica.py

```diff
@@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple
 
 import pydantic
 import pytest
-from pydantic.class_validators import validator
+from pydantic import validator
 from vertica_sqlalchemy_dialect.base import VerticaInspector
 
 from datahub.configuration.common import AllowDenyPattern
```
datahub/ingestion/source/sql_queries.py

```diff
@@ -8,6 +8,7 @@ from typing import ClassVar, Iterable, List, Optional, Union
 
 from pydantic import BaseModel, Field, validator
 
+from datahub.configuration.common import HiddenFromDocs
 from datahub.configuration.datetimes import parse_user_datetime
 from datahub.configuration.source_common import (
     EnvConfigMixin,
@@ -66,22 +67,21 @@ class SqlQueriesSourceConfig(
         default=BaseUsageConfig(),
     )
 
-    use_schema_resolver: bool = Field(
+    use_schema_resolver: HiddenFromDocs[bool] = Field(
+        True,
         description="Read SchemaMetadata aspects from DataHub to aid in SQL parsing. Turn off only for testing.",
-        default=True,
-        hidden_from_docs=True,
     )
     default_db: Optional[str] = Field(
+        None,
         description="The default database to use for unqualified table names",
-        default=None,
     )
     default_schema: Optional[str] = Field(
+        None,
         description="The default schema to use for unqualified table names",
-        default=None,
     )
     override_dialect: Optional[str] = Field(
+        None,
         description="The SQL dialect to use when parsing queries. Overrides automatic dialect detection.",
-        default=None,
     )
 
 
```
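The reshuffle here is cosmetic: the default moves from the `default=` keyword to `Field`'s first positional parameter, which is the default value in both pydantic v1 and v2. The two spellings are equivalent:

```python
from typing import Optional

from pydantic import BaseModel, Field


class Example(BaseModel):
    # Field's first positional parameter is the default value, so these
    # two declarations behave identically.
    default_db: Optional[str] = Field(
        None,
        description="The default database to use for unqualified table names",
    )
    default_schema: Optional[str] = Field(
        default=None,
        description="The default schema to use for unqualified table names",
    )


assert Example().default_db is None
assert Example().default_schema is None
```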
datahub/ingestion/source/state/checkpoint.py

```diff
@@ -68,7 +68,11 @@ class CheckpointStateBase(ConfigModel):
 
     @staticmethod
     def _to_bytes_utf8(model: ConfigModel) -> bytes:
-        return model.json(exclude={"version", "serde"}).encode("utf-8")
+        pydantic_json = model.model_dump_json(exclude={"version", "serde"})
+        # We decode and re-encode so that Python's default whitespace is included.
+        # This is purely to keep tests consistent as we migrate to pydantic v2,
+        # and can be removed once we're fully migrated.
+        return json.dumps(json.loads(pydantic_json)).encode("utf-8")
 
     @staticmethod
     def _to_bytes_base85_json(
```
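The comment in the new code explains the round-trip: pydantic v2's `model_dump_json` emits compact JSON, while v1's `.json()` went through `json.dumps`, whose default separators include spaces. Decoding and re-encoding restores the v1 byte layout so existing checkpoint fixtures keep matching. For example:

```python
import json

import pydantic


class State(pydantic.BaseModel):
    version: str = "1.0"
    urns: list = ["urn:li:dataset:a"]


state = State()

compact = state.model_dump_json()            # pydantic v2 output
roundtrip = json.dumps(json.loads(compact))  # Python's default separators

print(compact)    # {"version":"1.0","urns":["urn:li:dataset:a"]}
print(roundtrip)  # {"version": "1.0", "urns": ["urn:li:dataset:a"]}
```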
datahub/ingestion/source/state/entity_removal_state.py

```diff
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Iterable, List, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Tuple, Type
 
 import pydantic
 
@@ -8,13 +8,16 @@ from datahub.utilities.checkpoint_state_util import CheckpointStateUtil
 from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.urns.urn import guess_entity_type
 
+if TYPE_CHECKING:
+    from pydantic.deprecated.class_validators import V1RootValidator
+
 STATEFUL_INGESTION_IGNORED_ENTITY_TYPES = {
     "dataProcessInstance",
     "query",
 }
 
 
-def pydantic_state_migrator(mapping: Dict[str, str]) -> classmethod:
+def pydantic_state_migrator(mapping: Dict[str, str]) -> "V1RootValidator":
     # mapping would be something like:
     # {
     #     'encoded_view_urns': 'dataset',
```
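Same pattern as snowflake_lineage_v2.py above: the v1 validator type is imported only under `typing.TYPE_CHECKING` and referenced as a string, because importing `pydantic.deprecated.class_validators` at runtime triggers pydantic's deprecation warning. Distilled below; the no-op migrator body is illustrative, not the source's:

```python
from typing import TYPE_CHECKING, Any, Dict

import pydantic

if TYPE_CHECKING:
    # Evaluated only by type checkers. A runtime import of
    # pydantic.deprecated.class_validators would emit pydantic's
    # "deprecated since 2.0" warning.
    from pydantic.deprecated.class_validators import V1RootValidator


# The annotation is quoted because V1RootValidator is absent at runtime.
def noop_migrator() -> "V1RootValidator":
    def _noop(cls: type, values: Dict[str, Any]) -> Dict[str, Any]:
        return values

    return pydantic.root_validator(pre=True, allow_reuse=True)(_noop)
```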
datahub/ingestion/source/state/stateful_ingestion_base.py

```diff
@@ -10,6 +10,7 @@ from datahub.configuration.common import (
     ConfigModel,
     ConfigurationError,
     DynamicTypedConfig,
+    HiddenFromDocs,
 )
 from datahub.configuration.pydantic_migration_helpers import GenericModel
 from datahub.configuration.time_window_config import BaseTimeWindowConfig
@@ -55,25 +56,21 @@ class StatefulIngestionConfig(ConfigModel):
         description="Whether or not to enable stateful ingest. "
         "Default: True if a pipeline_name is set and either a datahub-rest sink or `datahub_api` is specified, otherwise False",
     )
-    max_checkpoint_state_size: pydantic.PositiveInt = Field(
+    max_checkpoint_state_size: HiddenFromDocs[pydantic.PositiveInt] = Field(
         default=2**24,  # 16 MB
         description="The maximum size of the checkpoint state in bytes. Default is 16MB",
-        hidden_from_docs=True,
     )
-    state_provider: Optional[DynamicTypedStateProviderConfig] = Field(
+    state_provider: HiddenFromDocs[Optional[DynamicTypedStateProviderConfig]] = Field(
         default=None,
         description="The ingestion state provider configuration.",
-        hidden_from_docs=True,
     )
-    ignore_old_state: bool = Field(
+    ignore_old_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the previous checkpoint state.",
-        hidden_from_docs=True,
    )
-    ignore_new_state: bool = Field(
+    ignore_new_state: HiddenFromDocs[bool] = Field(
         default=False,
         description="If set to True, ignores the current checkpoint state.",
-        hidden_from_docs=True,
     )
 
     @pydantic.root_validator(skip_on_failure=True)
```