acryl-datahub 1.2.0.10rc2__py3-none-any.whl → 1.2.0.10rc4__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Note: this version of acryl-datahub has been flagged as potentially problematic.
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/METADATA +2525 -2609
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/RECORD +93 -93
- datahub/_version.py +1 -1
- datahub/api/entities/assertion/assertion.py +1 -1
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/dataproduct/dataproduct.py +6 -3
- datahub/api/entities/dataset/dataset.py +9 -18
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/api/graphql/operation.py +10 -6
- datahub/cli/docker_check.py +2 -2
- datahub/configuration/common.py +29 -1
- datahub/configuration/connection_resolver.py +5 -2
- datahub/configuration/import_resolver.py +7 -4
- datahub/configuration/pydantic_migration_helpers.py +0 -9
- datahub/configuration/source_common.py +3 -2
- datahub/configuration/validate_field_deprecation.py +5 -2
- datahub/configuration/validate_field_removal.py +5 -2
- datahub/configuration/validate_field_rename.py +6 -5
- datahub/configuration/validate_multiline_string.py +5 -2
- datahub/ingestion/run/pipeline_config.py +2 -2
- datahub/ingestion/source/azure/azure_common.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +28 -14
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +4 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +3 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +16 -16
- datahub/ingestion/source/datahub/config.py +8 -9
- datahub/ingestion/source/delta_lake/config.py +1 -1
- datahub/ingestion/source/dremio/dremio_config.py +3 -4
- datahub/ingestion/source/feast.py +8 -10
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/ge_profiling_config.py +26 -22
- datahub/ingestion/source/grafana/grafana_config.py +2 -2
- datahub/ingestion/source/grafana/models.py +12 -14
- datahub/ingestion/source/hex/hex.py +6 -1
- datahub/ingestion/source/iceberg/iceberg_profiler.py +4 -2
- datahub/ingestion/source/kafka_connect/common.py +2 -2
- datahub/ingestion/source/looker/looker_common.py +55 -75
- datahub/ingestion/source/looker/looker_config.py +15 -4
- datahub/ingestion/source/looker/looker_source.py +445 -548
- datahub/ingestion/source/looker/lookml_config.py +1 -1
- datahub/ingestion/source/metadata/business_glossary.py +7 -7
- datahub/ingestion/source/metadata/lineage.py +1 -1
- datahub/ingestion/source/mode.py +13 -5
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +14 -21
- datahub/ingestion/source/preset.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +28 -8
- datahub/ingestion/source/redshift/config.py +6 -3
- datahub/ingestion/source/salesforce.py +13 -9
- datahub/ingestion/source/schema/json_schema.py +14 -14
- datahub/ingestion/source/sigma/data_classes.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +12 -15
- datahub/ingestion/source/snowflake/snowflake_connection.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +15 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +4 -5
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +12 -7
- datahub/ingestion/source/sql/cockroachdb.py +5 -3
- datahub/ingestion/source/sql/druid.py +2 -2
- datahub/ingestion/source/sql/hive.py +4 -3
- datahub/ingestion/source/sql/hive_metastore.py +7 -9
- datahub/ingestion/source/sql/mssql/source.py +2 -2
- datahub/ingestion/source/sql/mysql.py +2 -2
- datahub/ingestion/source/sql/oracle.py +3 -3
- datahub/ingestion/source/sql/presto.py +2 -1
- datahub/ingestion/source/sql/teradata.py +4 -4
- datahub/ingestion/source/sql/trino.py +2 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +2 -3
- datahub/ingestion/source/sql/vertica.py +1 -1
- datahub/ingestion/source/sql_queries.py +6 -6
- datahub/ingestion/source/state/checkpoint.py +5 -1
- datahub/ingestion/source/state/entity_removal_state.py +5 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +5 -8
- datahub/ingestion/source/superset.py +29 -4
- datahub/ingestion/source/tableau/tableau.py +65 -11
- datahub/ingestion/source/tableau/tableau_common.py +5 -0
- datahub/ingestion/source/tableau/tableau_constant.py +1 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +3 -0
- datahub/ingestion/source/unity/config.py +7 -3
- datahub/ingestion/source/usage/usage_common.py +3 -3
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/metadata/_internal_schema_classes.py +45 -1
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/schema.avsc +24 -1
- datahub/metadata/schemas/InstitutionalMemory.avsc +22 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +22 -0
- datahub/sdk/dashboard.py +0 -2
- datahub/sdk/search_filters.py +1 -7
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.10rc2.dist-info → acryl_datahub-1.2.0.10rc4.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/data_lake_common/path_spec.py

@@ -11,7 +11,7 @@ from cached_property import cached_property
 from pydantic.fields import Field
 from wcmatch import pathlib

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.ingestion.source.aws.s3_util import is_s3_uri
 from datahub.ingestion.source.azure.abs_utils import is_abs_uri
 from datahub.ingestion.source.gcs.gcs_utils import is_gcs_uri
@@ -89,63 +89,62 @@ class PathSpec(ConfigModel):
         description="Path to table. Name variable `{table}` is used to mark the folder with dataset. In absence of `{table}`, file level dataset will be created. Check below examples for more details."
     )
     exclude: Optional[List[str]] = Field(
-        default=[],
+        [],
         description="list of paths in glob pattern which will be excluded while scanning for the datasets",
     )
     file_types: List[str] = Field(
-        default=SUPPORTED_FILE_TYPES,
+        SUPPORTED_FILE_TYPES,
         description="Files with extenstions specified here (subset of default value) only will be scanned to create dataset. Other files will be omitted.",
     )

     default_extension: Optional[str] = Field(
-        default=None,
+        None,
         description="For files without extension it will assume the specified file type. If it is not set the files without extensions will be skipped.",
     )

     table_name: Optional[str] = Field(
-        default=None,
+        None,
         description="Display name of the dataset.Combination of named variables from include path and strings",
     )

     # This is not used yet, but will be used in the future to sort the partitions
-    sort_key: Optional[SortKey] = Field(
-        hidden_from_docs=True,
-        default=None,
+    sort_key: HiddenFromDocs[Optional[SortKey]] = Field(
+        None,
         description="Sort key to use when sorting the partitions. This is useful when the partitions are not sorted in the order of the data. The key can be a compound key based on the path_spec variables.",
     )

     enable_compression: bool = Field(
-        default=True,
+        True,
         description="Enable or disable processing compressed files. Currently .gz and .bz files are supported.",
     )

     sample_files: bool = Field(
-        default=True,
+        True,
         description="Not listing all the files but only taking a handful amount of sample file to infer the schema. File count and file size calculation will be disabled. This can affect performance significantly if enabled",
     )

     allow_double_stars: bool = Field(
-        default=False,
+        False,
         description="Allow double stars in the include path. This can affect performance significantly if enabled",
     )

     autodetect_partitions: bool = Field(
-        default=True,
+        True,
         description="Autodetect partition(s) from the path. If set to true, it will autodetect partition key/value if the folder format is {partition_key}={partition_value} for example `year=2024`",
     )

     traversal_method: FolderTraversalMethod = Field(
-        default=FolderTraversalMethod.MAX,
+        FolderTraversalMethod.MAX,
         description="Method to traverse the folder. ALL: Traverse all the folders, MIN_MAX: Traverse the folders by finding min and max value, MAX: Traverse the folder with max value",
     )

     include_hidden_folders: bool = Field(
-        default=False,
+        False,
         description="Include hidden folders in the traversal (folders starting with . or _",
     )

     tables_filter_pattern: AllowDenyPattern = Field(
-        default=AllowDenyPattern.allow_all(),
+        AllowDenyPattern.allow_all(),
         description="The tables_filter_pattern configuration field uses regular expressions to filter the tables part of the Pathspec for ingestion, allowing fine-grained control over which tables are included or excluded based on specified patterns. The default setting allows all tables.",
     )

@@ -479,7 +478,8 @@ class PathSpec(ConfigModel):
         return glob_include

     @pydantic.root_validator(skip_on_failure=True)
-    def validate_path_spec(cls, values: Dict) -> Dict[str, Any]:
+    @staticmethod
+    def validate_path_spec(values: Dict) -> Dict[str, Any]:
         # validate that main fields are populated
         required_fields = ["include", "file_types", "default_extension"]
         for f in required_fields:

datahub/ingestion/source/datahub/config.py

@@ -4,7 +4,7 @@ from typing import Optional, Set
 import pydantic
 from pydantic import Field, root_validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConnectionConfig
 from datahub.ingestion.source.state.stateful_ingestion_base import (
@@ -98,16 +98,14 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         ),
     )

-    pull_from_datahub_api: bool = Field(
+    pull_from_datahub_api: HiddenFromDocs[bool] = Field(
         default=False,
         description="Use the DataHub API to fetch versioned aspects.",
-        hidden_from_docs=True,
     )

-    max_workers: int = Field(
+    max_workers: HiddenFromDocs[int] = Field(
         default=5 * (os.cpu_count() or 4),
         description="Number of worker threads to use for datahub api ingestion.",
-        hidden_from_docs=True,
     )

     urn_pattern: AllowDenyPattern = Field(default=AllowDenyPattern())
@@ -118,10 +116,11 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
         "Useful if the source system has duplicate field paths in the db, but we're pushing to a system with server-side duplicate checking.",
     )

-    structured_properties_template_cache_invalidation_interval: int = Field(
-        default=60,
-        description="Interval in seconds to invalidate the structured properties template cache.",
-        hidden_from_docs=True,
+    structured_properties_template_cache_invalidation_interval: HiddenFromDocs[int] = (
+        Field(
+            default=60,
+            description="Interval in seconds to invalidate the structured properties template cache.",
+        )
     )

     query_timeout: Optional[int] = Field(

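A recurring change in this release (path_spec.py, datahub/config.py, dremio_config.py, grafana_config.py) replaces the `hidden_from_docs=True` keyword on `Field(...)` with a `HiddenFromDocs[...]` type annotation imported from `datahub.configuration.common`. That class is not defined anywhere in this diff, so the sketch below uses a hypothetical `Annotated`-based stand-in purely to illustrate how such an alias can carry the "hide from docs" flag in the annotation and be read back, e.g. by a docs generator.

```python
# Hedged sketch only: `_Hidden` and `HiddenFromDocs` are stand-ins, not the real
# datahub.configuration.common implementation. Assumes pydantic >= 2 and Python >= 3.9.
from typing import Annotated, Optional, TypeVar, get_type_hints

from pydantic import BaseModel, Field

T = TypeVar("T")


class _Hidden:
    """Marker object carried in Annotated metadata."""


# Subscriptable alias: HiddenFromDocs[bool] == Annotated[bool, _Hidden()]
HiddenFromDocs = Annotated[T, _Hidden()]


class ExampleConfig(BaseModel):
    # Before (old style): max_workers: int = Field(default=5, hidden_from_docs=True)
    # After (new style): the "hidden" flag lives in the annotation, not the Field kwargs.
    max_workers: HiddenFromDocs[int] = Field(default=5, description="Worker threads.")
    query_timeout: Optional[int] = Field(default=None, description="Shown in docs.")


def hidden_fields(model: type[BaseModel]) -> list[str]:
    # A docs generator could skip any field whose Annotated metadata contains _Hidden.
    hints = get_type_hints(model, include_extras=True)
    return [
        name
        for name, hint in hints.items()
        if any(isinstance(m, _Hidden) for m in getattr(hint, "__metadata__", ()))
    ]


print(hidden_fields(ExampleConfig))  # ['max_workers']
```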
datahub/ingestion/source/dremio/dremio_config.py

@@ -4,7 +4,7 @@ from typing import List, Literal, Optional
 import certifi
 from pydantic import Field, validator

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, HiddenFromDocs
 from datahub.configuration.source_common import (
     EnvConfigMixin,
     PlatformInstanceConfigMixin,
@@ -100,10 +100,9 @@ class ProfileConfig(GEProfilingBaseConfig):
     query_timeout: int = Field(
         default=300, description="Time before cancelling Dremio profiling query"
     )
-    include_field_median_value: bool = Field(
+    include_field_median_value: HiddenFromDocs[bool] = Field(
+        # Hidden because median causes a number of issues in Dremio.
         default=False,
-        hidden_from_docs=True,
-        description="Median causes a number of issues in Dremio.",
     )

datahub/ingestion/source/feast.py

@@ -1,5 +1,6 @@
+import pathlib
 from dataclasses import dataclass
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Union

 import feast.types
 from feast import (
@@ -97,7 +98,7 @@ class FeastRepositorySourceConfig(
     StatefulIngestionConfigBase,
 ):
     path: str = Field(description="Path to Feast repository")
-    fs_yaml_file: Optional[str] = Field(
+    fs_yaml_file: Optional[pathlib.Path] = Field(
         default=None,
         description="Path to the `feature_store.yaml` file used to configure the feature store",
     )
@@ -142,17 +143,14 @@ class FeastRepositorySource(StatefulIngestionSourceBase):
     - Column types associated with each entity and feature
     """

-    platform = "feast"
-    source_config: FeastRepositorySourceConfig
-    report: StaleEntityRemovalSourceReport
-    feature_store: FeatureStore
+    platform: ClassVar[str] = "feast"

     def __init__(self, config: FeastRepositorySourceConfig, ctx: PipelineContext):
         super().__init__(config, ctx)
-        self.source_config = config
-        self.ctx = ctx
-        self.report = StaleEntityRemovalSourceReport()
-        self.feature_store = FeatureStore(
+        self.source_config: FeastRepositorySourceConfig = config
+        self.ctx: PipelineContext = ctx
+        self.report: StaleEntityRemovalSourceReport = StaleEntityRemovalSourceReport()
+        self.feature_store: FeatureStore = FeatureStore(
             repo_path=self.source_config.path,
             fs_yaml_file=self.source_config.fs_yaml_file,
         )

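The feast change drops the bare class-level attribute declarations in favour of a `ClassVar[str]` for the connector's constant `platform` name plus explicitly typed instance attributes assigned in `__init__`. A minimal sketch of the distinction, with illustrative names rather than the real DataHub classes:

```python
from dataclasses import dataclass
from typing import ClassVar


@dataclass
class Report:
    events: int = 0


class ExampleSource:
    # ClassVar: a shared constant on the class, not a per-instance field.
    platform: ClassVar[str] = "feast"

    def __init__(self, path: str) -> None:
        # Typed instance attributes, one set per object.
        self.path: str = path
        self.report: Report = Report()


a, b = ExampleSource("repo_a"), ExampleSource("repo_b")
assert a.platform == b.platform == "feast"  # shared class-level constant
assert a.report is not b.report             # per-instance state
```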
datahub/ingestion/source/fivetran/config.py

@@ -102,7 +102,7 @@ class FivetranLogConfig(ConfigModel):
         "destination_config", "snowflake_destination_config"
     )

-    @root_validator()
+    @root_validator(skip_on_failure=True)
     def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
         destination_platform = values["destination_platform"]
         if destination_platform == "snowflake":

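`skip_on_failure=True` tells a v1-style root validator not to run when an earlier field validation has already failed, so the validator body can safely index into `values`; it is also the form pydantic 2's compatibility shim requires for non-`pre` root validators. A small sketch of the semantics, assuming the pydantic v1-style API (available as `pydantic.v1` under pydantic 2.x); the class is illustrative, not the real FivetranLogConfig:

```python
from pydantic.v1 import BaseModel, ValidationError, root_validator


class DestinationConfig(BaseModel):
    destination_platform: str = "snowflake"
    snowflake_destination_config: dict = {}

    @root_validator(skip_on_failure=True)
    def check_destination(cls, values: dict) -> dict:
        # Safe to index: this validator is skipped if field validation already failed,
        # so `values` contains every successfully validated field.
        if values["destination_platform"] == "snowflake":
            assert values["snowflake_destination_config"] is not None
        return values


try:
    DestinationConfig(destination_platform=123.4j)  # complex -> str coercion fails
except ValidationError as e:
    # The root validator did not run; only the field error is reported.
    print(e.errors()[0]["loc"])  # ('destination_platform',)
```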
datahub/ingestion/source/ge_profiling_config.py

@@ -1,12 +1,12 @@
 import datetime
 import logging
 import os
-from typing import Any, Dict, List, Optional
+from typing import Annotated, Any, Dict, List, Optional

 import pydantic
 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, SupportedSources
 from datahub.ingestion.source_config.operation_config import OperationConfig

 _PROFILING_FLAGS_TO_REPORT = {
@@ -120,37 +120,40 @@ class GEProfilingConfig(GEProfilingBaseConfig):
         "number of columns to profile goes up.",
     )

-    profile_if_updated_since_days: Optional[pydantic.PositiveFloat] = Field(
+    profile_if_updated_since_days: Annotated[
+        Optional[pydantic.PositiveFloat], SupportedSources(["snowflake", "bigquery"])
+    ] = Field(
         default=None,
         description="Profile table only if it has been updated since these many number of days. "
         "If set to `null`, no constraint of last modified time for tables to profile. "
         "Supported only in `snowflake` and `BigQuery`.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery"]},
     )

-    profile_table_size_limit: Optional[int] = Field(
+    profile_table_size_limit: Annotated[
+        Optional[int],
+        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
+    ] = Field(
         default=5,
         description="Profile tables only if their size is less than specified GBs. If set to `null`, "
         "no limit on the size of tables to profile. Supported only in `Snowflake`, `BigQuery` and "
         "`Databricks`. Supported for `Oracle` based on calculated size from gathered stats.",
-        schema_extra={
-            "supported_sources": ["snowflake", "bigquery", "unity-catalog", "oracle"]
-        },
     )

-    profile_table_row_limit: Optional[int] = Field(
+    profile_table_row_limit: Annotated[
+        Optional[int], SupportedSources(["snowflake", "bigquery", "oracle"])
+    ] = Field(
         default=5000000,
         description="Profile tables only if their row count is less than specified count. "
         "If set to `null`, no limit on the row count of tables to profile. Supported only in "
         "`Snowflake`, `BigQuery`. Supported for `Oracle` based on gathered stats.",
-        schema_extra={"supported_sources": ["snowflake", "bigquery", "oracle"]},
    )

-    profile_table_row_count_estimate_only: bool = Field(
+    profile_table_row_count_estimate_only: Annotated[
+        bool, SupportedSources(["postgres", "mysql"])
+    ] = Field(
         default=False,
         description="Use an approximate query for row count. This will be much faster but slightly "
         "less accurate. Only supported for Postgres and MySQL. ",
-        schema_extra={"supported_sources": ["postgres", "mysql"]},
     )

     # The query combiner enables us to combine multiple queries into a single query,
@@ -163,36 +166,37 @@ class GEProfilingConfig(GEProfilingBaseConfig):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = Field(default=True, description="")

-    partition_profiling_enabled: bool = Field(
+    partition_profiling_enabled: Annotated[
+        bool, SupportedSources(["athena", "bigquery"])
+    ] = Field(
         default=True,
         description="Whether to profile partitioned tables. Only BigQuery and Aws Athena supports this. "
         "If enabled, latest partition data is used for profiling.",
-        schema_extra={"supported_sources": ["athena", "bigquery"]},
     )
-    partition_datetime: Optional[datetime.datetime] = Field(
+    partition_datetime: Annotated[
+        Optional[datetime.datetime], SupportedSources(["bigquery"])
+    ] = Field(
         default=None,
         description="If specified, profile only the partition which matches this datetime. "
         "If not specified, profile the latest partition. Only Bigquery supports this.",
-        schema_extra={"supported_sources": ["bigquery"]},
     )
-    use_sampling: bool = Field(
+    use_sampling: Annotated[bool, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=True,
         description="Whether to profile column level stats on sample of table. Only BigQuery and Snowflake support this. "
         "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

-    sample_size: int = Field(
+    sample_size: Annotated[int, SupportedSources(["bigquery", "snowflake"])] = Field(
         default=10000,
         description="Number of rows to be sampled from table for column level profiling."
         "Applicable only if `use_sampling` is set to True.",
-        schema_extra={"supported_sources": ["bigquery", "snowflake"]},
     )

-    profile_external_tables: bool = Field(
+    profile_external_tables: Annotated[
+        bool, SupportedSources(["redshift", "snowflake"])
+    ] = Field(
         default=False,
         description="Whether to profile external tables. Only Snowflake and Redshift supports this.",
-        schema_extra={"supported_sources": ["redshift", "snowflake"]},
     )

     tags_to_ignore_sampling: Optional[List[str]] = pydantic.Field(

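The GE profiling config moves the per-field `schema_extra={"supported_sources": [...]}` hints into `Annotated[..., SupportedSources([...])]` metadata. `SupportedSources` itself is not defined in this diff; the sketch below uses a hypothetical stand-in to show how such metadata can be attached to a field's type and read back, for example by a docs generator.

```python
# Hedged sketch: `SupportedSources` here is a stand-in, not the real class imported
# from datahub.configuration.common above. Assumes pydantic >= 2 and Python >= 3.9.
from typing import Annotated, Optional, get_type_hints

from pydantic import BaseModel, Field


class SupportedSources:
    """Stand-in metadata marker; the real datahub implementation may differ."""

    def __init__(self, sources: list[str]) -> None:
        self.sources = tuple(sources)


class ProfilingConfig(BaseModel):
    profile_table_size_limit: Annotated[
        Optional[int],
        SupportedSources(["snowflake", "bigquery", "unity-catalog", "oracle"]),
    ] = Field(default=5, description="Profile tables only if smaller than this many GB.")


def supported_sources(model: type[BaseModel], field: str) -> tuple[str, ...]:
    # Read the marker back out of the field's Annotated metadata.
    hint = get_type_hints(model, include_extras=True)[field]
    for meta in getattr(hint, "__metadata__", ()):
        if isinstance(meta, SupportedSources):
            return meta.sources
    return ()


print(supported_sources(ProfilingConfig, "profile_table_size_limit"))
# ('snowflake', 'bigquery', 'unity-catalog', 'oracle')
```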
datahub/ingestion/source/grafana/grafana_config.py

@@ -2,7 +2,7 @@ from typing import Dict, Optional

 from pydantic import Field, SecretStr, validator

-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, HiddenFromDocs
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     EnvConfigMixin,
@@ -37,7 +37,7 @@ class GrafanaSourceConfig(
 ):
     """Configuration for Grafana source"""

-    platform: str = Field(default="grafana", hidden_from_docs=True)
+    platform: HiddenFromDocs[str] = Field(default="grafana")
     url: str = Field(
         description="Grafana URL in the format http://your-grafana-instance with no trailing slash"
     )

datahub/ingestion/source/grafana/models.py

@@ -10,9 +10,8 @@ References:

 from typing import Any, Dict, List, Optional

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

-from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
 from datahub.emitter.mcp_builder import ContainerKey

 # Grafana-specific type definitions for better type safety
@@ -25,7 +24,11 @@ GrafanaFieldConfig = Dict[
 GrafanaTransformation = Dict[str, Any]  # Transformations: id, options


-class DatasourceRef(BaseModel):
+class _GrafanaBaseModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
+
+class DatasourceRef(_GrafanaBaseModel):
     """Reference to a Grafana datasource."""

     type: Optional[str] = None  # Datasource type (prometheus, mysql, postgres, etc.)
@@ -33,13 +36,13 @@ class DatasourceRef(BaseModel):
     name: Optional[str] = None  # Datasource display name


-class Panel(BaseModel):
+class Panel(_GrafanaBaseModel):
     """Represents a Grafana dashboard panel."""

     id: str
     title: str
     description: str = ""
-    type: Optional[str]
+    type: Optional[str] = None
     # Query targets - each contains refId (A,B,C...), query/expr, datasource ref, etc.
     query_targets: List[GrafanaQueryTarget] = Field(
         default_factory=list, alias="targets"
@@ -52,16 +55,16 @@ class Panel(BaseModel):
     transformations: List[GrafanaTransformation] = Field(default_factory=list)


-class Dashboard(BaseModel):
+class Dashboard(_GrafanaBaseModel):
     """Represents a Grafana dashboard."""

     uid: str
     title: str
     description: str = ""
-    version: Optional[str]
+    version: Optional[str] = None
     panels: List[Panel]
     tags: List[str]
-    timezone: Optional[str]
+    timezone: Optional[str] = None
     refresh: Optional[str] = None
     schema_version: Optional[str] = Field(default=None, alias="schemaVersion")
     folder_id: Optional[str] = Field(default=None, alias="meta.folderId")
@@ -100,18 +103,13 @@ class Dashboard(BaseModel):
         return super().parse_obj(dashboard_dict)


-class Folder(BaseModel):
+class Folder(_GrafanaBaseModel):
     """Represents a Grafana folder."""

     id: str
     title: str
     description: Optional[str] = ""

-    if PYDANTIC_VERSION_2:
-        from pydantic import ConfigDict
-
-        model_config = ConfigDict(coerce_numbers_to_str=True)  # type: ignore
-

 class FolderKey(ContainerKey):
     """Key for identifying a Grafana folder."""

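Instead of each model conditionally setting `model_config` behind a `PYDANTIC_VERSION_2` check, the Grafana models now inherit from a single `_GrafanaBaseModel` that sets `ConfigDict(coerce_numbers_to_str=True)` once. A condensed, pydantic-2-only sketch of the same idea:

```python
# Condensed sketch of the shared-base-model pattern (requires pydantic >= 2).
from typing import Optional

from pydantic import BaseModel, ConfigDict


class _GrafanaBaseModel(BaseModel):
    # Applied to every subclass: numeric JSON values are accepted for str fields.
    model_config = ConfigDict(coerce_numbers_to_str=True)


class Folder(_GrafanaBaseModel):
    id: str
    title: str
    description: Optional[str] = ""


# Grafana APIs sometimes return numeric ids; coercion turns 42 into "42".
print(Folder(id=42, title="Dashboards"))  # id='42' title='Dashboards' description=''
```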
datahub/ingestion/source/hex/hex.py

@@ -1,3 +1,4 @@
+from copy import deepcopy
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Any, Dict, Iterable, List, Optional
@@ -122,7 +123,11 @@ class HexSourceConfig(

     @root_validator(pre=True)
     def validate_lineage_times(cls, data: Dict[str, Any]) -> Dict[str, Any]:
-        #
+        # In-place update of the input dict would cause state contamination. This was discovered through test failures
+        # in test_hex.py where the same dict is reused.
+        # So a deepcopy is performed first.
+        data = deepcopy(data)
+
         if "lineage_end_time" not in data or data["lineage_end_time"] is None:
             data["lineage_end_time"] = datetime.now(tz=timezone.utc)
         # if string is given, parse it

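The hex change copies the incoming dict before a `pre=True` root validator fills in defaults, so the caller's dict (which may be reused across runs or tests) is never mutated. A minimal sketch of the hazard and the fix, shown here with pydantic 2's `model_validator(mode="before")` rather than the v1-style `root_validator` used in the source, and with illustrative class and field names:

```python
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Dict, Optional

from pydantic import BaseModel, model_validator


class LineageWindow(BaseModel):
    lineage_end_time: Optional[datetime] = None

    @model_validator(mode="before")
    @classmethod
    def fill_defaults(cls, data: Dict[str, Any]) -> Dict[str, Any]:
        data = deepcopy(data)  # without this, the caller's dict would be mutated
        if data.get("lineage_end_time") is None:
            data["lineage_end_time"] = datetime.now(tz=timezone.utc)
        return data


raw = {"lineage_end_time": None}
LineageWindow.model_validate(raw)
assert raw == {"lineage_end_time": None}  # input left untouched thanks to deepcopy
```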
datahub/ingestion/source/iceberg/iceberg_profiler.py

@@ -12,6 +12,7 @@ from pyiceberg.types import (
     IcebergType,
     IntegerType,
     LongType,
+    PrimitiveType,
     TimestampType,
     TimestamptzType,
     TimeType,
@@ -22,6 +23,7 @@ from pyiceberg.utils.datetime import (
     to_human_timestamp,
     to_human_timestamptz,
 )
+from typing_extensions import TypeGuard

 from datahub.emitter.mce_builder import get_sys_time
 from datahub.ingestion.source.iceberg.iceberg_common import (
@@ -65,7 +67,7 @@ class IcebergProfiler:
         aggregated_values: Dict[int, Any],
         manifest_values: Dict[int, bytes],
     ) -> None:
-        for field_id, value_encoded in manifest_values.items():
+        for field_id, value_encoded in manifest_values.items():
             try:
                 field = schema.find_field(field_id)
             except ValueError:
@@ -240,7 +242,7 @@ class IcebergProfiler:
             return None

     @staticmethod
-    def _is_numeric_type(type: IcebergType) -> bool:
+    def _is_numeric_type(type: IcebergType) -> TypeGuard[PrimitiveType]:
         return isinstance(
             type,
             (

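Returning `TypeGuard[PrimitiveType]` instead of plain `bool` lets type checkers narrow the argument's type after the check succeeds. A generic sketch of the pattern, unrelated to the Iceberg classes themselves:

```python
# Generic TypeGuard sketch; the numeric types here are illustrative.
from typing import Union

from typing_extensions import TypeGuard

Number = Union[int, float]


def is_int(value: Number) -> TypeGuard[int]:
    # When this returns True, the type checker treats `value` as `int` in the caller.
    return isinstance(value, int)


def bit_length(value: Number) -> int:
    if is_int(value):
        return value.bit_length()  # checker now knows `value: int`
    return int(value).bit_length()


print(bit_length(255), bit_length(3.5))  # 8 2
```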
datahub/ingestion/source/kafka_connect/common.py

@@ -4,7 +4,7 @@ from typing import Dict, Iterable, List, Optional

 from pydantic.fields import Field

-from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.common import AllowDenyPattern, ConfigModel, LaxStr
 from datahub.configuration.source_common import (
     DatasetLineageProviderConfigBase,
     PlatformInstanceConfigMixin,
@@ -29,7 +29,7 @@ CONNECTOR_CLASS = "connector.class"
 class ProvidedConfig(ConfigModel):
     provider: str
     path_key: str
-    value: str
+    value: LaxStr


 class GenericConnectorConfig(ConfigModel):
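`LaxStr` is imported from `datahub.configuration.common` but not defined in this diff; the name suggests a string type that tolerates non-string inputs, such as numeric connector config values. The sketch below is a hypothetical stand-in built on an `Annotated` + `BeforeValidator` pattern, purely to illustrate what such a field could accept; the real datahub definition may differ.

```python
# Hypothetical LaxStr-like type (not the real datahub definition). Requires pydantic >= 2.
from typing import Annotated

from pydantic import BaseModel, BeforeValidator

LaxStr = Annotated[
    str, BeforeValidator(lambda v: str(v) if isinstance(v, (int, float, bool)) else v)
]


class ProvidedConfig(BaseModel):
    provider: str
    path_key: str
    value: LaxStr


# A numeric value coming from connector JSON is accepted and stored as a string.
cfg = ProvidedConfig(provider="env", path_key="PORT", value=5432)
print(cfg.value, type(cfg.value).__name__)  # 5432 str
```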
|