acryl-datahub 1.3.0.1rc9__py3-none-any.whl → 1.3.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/METADATA +2550 -2543
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/RECORD +263 -261
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +2 -2
- datahub/api/entities/corpgroup/corpgroup.py +11 -6
- datahub/api/entities/corpuser/corpuser.py +11 -11
- datahub/api/entities/dataproduct/dataproduct.py +47 -27
- datahub/api/entities/dataset/dataset.py +32 -21
- datahub/api/entities/external/lake_formation_external_entites.py +5 -6
- datahub/api/entities/external/unity_catalog_external_entites.py +5 -7
- datahub/api/entities/forms/forms.py +16 -14
- datahub/api/entities/structuredproperties/structuredproperties.py +23 -16
- datahub/cli/check_cli.py +2 -2
- datahub/cli/config_utils.py +3 -3
- datahub/cli/lite_cli.py +9 -7
- datahub/cli/migrate.py +4 -4
- datahub/cli/quickstart_versioning.py +3 -3
- datahub/cli/specific/group_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +1 -1
- datahub/cli/specific/user_cli.py +1 -1
- datahub/configuration/common.py +14 -2
- datahub/configuration/connection_resolver.py +2 -2
- datahub/configuration/git.py +47 -30
- datahub/configuration/import_resolver.py +2 -2
- datahub/configuration/kafka.py +4 -3
- datahub/configuration/time_window_config.py +26 -26
- datahub/configuration/validate_field_deprecation.py +2 -2
- datahub/configuration/validate_field_removal.py +2 -2
- datahub/configuration/validate_field_rename.py +2 -2
- datahub/configuration/validate_multiline_string.py +2 -1
- datahub/emitter/kafka_emitter.py +3 -1
- datahub/emitter/rest_emitter.py +2 -4
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/report.py +1 -1
- datahub/ingestion/api/sink.py +1 -1
- datahub/ingestion/api/source.py +1 -1
- datahub/ingestion/glossary/datahub_classifier.py +11 -8
- datahub/ingestion/graph/client.py +5 -1
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/reporting/file_reporter.py +5 -4
- datahub/ingestion/run/pipeline.py +7 -6
- datahub/ingestion/run/pipeline_config.py +12 -14
- datahub/ingestion/run/sink_callback.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +6 -4
- datahub/ingestion/source/abs/config.py +19 -19
- datahub/ingestion/source/abs/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/abs/source.py +2 -2
- datahub/ingestion/source/aws/aws_common.py +1 -1
- datahub/ingestion/source/aws/glue.py +6 -4
- datahub/ingestion/source/aws/sagemaker.py +1 -1
- datahub/ingestion/source/azure/azure_common.py +8 -12
- datahub/ingestion/source/bigquery_v2/bigquery.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +43 -30
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -1
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/gcp_credentials_config.py +10 -10
- datahub/ingestion/source/data_lake_common/path_spec.py +85 -89
- datahub/ingestion/source/datahub/config.py +8 -8
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +9 -3
- datahub/ingestion/source/dbt/dbt_common.py +39 -37
- datahub/ingestion/source/dbt/dbt_core.py +10 -12
- datahub/ingestion/source/debug/datahub_debug.py +1 -1
- datahub/ingestion/source/delta_lake/config.py +6 -4
- datahub/ingestion/source/dremio/dremio_api.py +212 -78
- datahub/ingestion/source/dremio/dremio_config.py +10 -6
- datahub/ingestion/source/dremio/dremio_entities.py +55 -39
- datahub/ingestion/source/dremio/dremio_profiling.py +14 -3
- datahub/ingestion/source/dremio/dremio_source.py +24 -26
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -1
- datahub/ingestion/source/elastic_search.py +110 -32
- datahub/ingestion/source/excel/source.py +1 -1
- datahub/ingestion/source/feast.py +1 -1
- datahub/ingestion/source/file.py +5 -4
- datahub/ingestion/source/fivetran/config.py +17 -16
- datahub/ingestion/source/fivetran/fivetran.py +2 -2
- datahub/ingestion/source/gc/datahub_gc.py +1 -1
- datahub/ingestion/source/gcs/gcs_source.py +8 -10
- datahub/ingestion/source/ge_profiling_config.py +8 -5
- datahub/ingestion/source/grafana/grafana_api.py +2 -2
- datahub/ingestion/source/grafana/grafana_config.py +4 -3
- datahub/ingestion/source/grafana/grafana_source.py +1 -1
- datahub/ingestion/source/grafana/models.py +23 -5
- datahub/ingestion/source/hex/api.py +7 -5
- datahub/ingestion/source/hex/hex.py +4 -3
- datahub/ingestion/source/iceberg/iceberg.py +1 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +5 -3
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +10 -10
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +7 -5
- datahub/ingestion/source/looker/looker_config.py +21 -20
- datahub/ingestion/source/looker/lookml_config.py +47 -47
- datahub/ingestion/source/metabase.py +8 -8
- datahub/ingestion/source/metadata/business_glossary.py +2 -2
- datahub/ingestion/source/metadata/lineage.py +13 -8
- datahub/ingestion/source/mlflow.py +1 -1
- datahub/ingestion/source/mode.py +6 -4
- datahub/ingestion/source/mongodb.py +4 -3
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +17 -23
- datahub/ingestion/source/openapi.py +6 -8
- datahub/ingestion/source/powerbi/config.py +33 -32
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +2 -2
- datahub/ingestion/source/powerbi/powerbi.py +1 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +2 -2
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +8 -6
- datahub/ingestion/source/preset.py +8 -8
- datahub/ingestion/source/pulsar.py +1 -1
- datahub/ingestion/source/qlik_sense/data_classes.py +15 -8
- datahub/ingestion/source/qlik_sense/qlik_api.py +7 -7
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +18 -20
- datahub/ingestion/source/redshift/redshift.py +2 -2
- datahub/ingestion/source/redshift/usage.py +23 -3
- datahub/ingestion/source/s3/config.py +83 -62
- datahub/ingestion/source/s3/datalake_profiler_config.py +11 -13
- datahub/ingestion/source/s3/source.py +8 -5
- datahub/ingestion/source/sac/sac.py +5 -4
- datahub/ingestion/source/salesforce.py +3 -2
- datahub/ingestion/source/schema/json_schema.py +2 -2
- datahub/ingestion/source/sigma/data_classes.py +3 -2
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/sigma/sigma_api.py +7 -7
- datahub/ingestion/source/slack/slack.py +1 -1
- datahub/ingestion/source/snaplogic/snaplogic.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_assertion.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +35 -31
- datahub/ingestion/source/snowflake/snowflake_connection.py +35 -13
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +3 -3
- datahub/ingestion/source/snowflake/snowflake_queries.py +28 -4
- datahub/ingestion/source/sql/athena.py +1 -1
- datahub/ingestion/source/sql/clickhouse.py +4 -2
- datahub/ingestion/source/sql/cockroachdb.py +1 -1
- datahub/ingestion/source/sql/druid.py +1 -1
- datahub/ingestion/source/sql/hana.py +1 -1
- datahub/ingestion/source/sql/hive.py +7 -5
- datahub/ingestion/source/sql/hive_metastore.py +1 -1
- datahub/ingestion/source/sql/mssql/source.py +13 -6
- datahub/ingestion/source/sql/mysql.py +1 -1
- datahub/ingestion/source/sql/oracle.py +17 -10
- datahub/ingestion/source/sql/postgres.py +2 -2
- datahub/ingestion/source/sql/presto.py +1 -1
- datahub/ingestion/source/sql/sql_config.py +8 -9
- datahub/ingestion/source/sql/sql_generic.py +1 -1
- datahub/ingestion/source/sql/teradata.py +1 -1
- datahub/ingestion/source/sql/trino.py +1 -1
- datahub/ingestion/source/sql/vertica.py +5 -4
- datahub/ingestion/source/sql_queries.py +174 -22
- datahub/ingestion/source/state/checkpoint.py +2 -2
- datahub/ingestion/source/state/entity_removal_state.py +2 -1
- datahub/ingestion/source/state/stateful_ingestion_base.py +55 -45
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +9 -9
- datahub/ingestion/source/tableau/tableau.py +14 -16
- datahub/ingestion/source/unity/azure_auth_config.py +15 -0
- datahub/ingestion/source/unity/config.py +51 -34
- datahub/ingestion/source/unity/connection.py +7 -1
- datahub/ingestion/source/unity/connection_test.py +1 -1
- datahub/ingestion/source/unity/proxy.py +216 -7
- datahub/ingestion/source/unity/proxy_types.py +91 -0
- datahub/ingestion/source/unity/source.py +29 -3
- datahub/ingestion/source/usage/clickhouse_usage.py +1 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +1 -1
- datahub/ingestion/source/usage/usage_common.py +5 -3
- datahub/ingestion/source_config/csv_enricher.py +7 -6
- datahub/ingestion/source_config/operation_config.py +7 -4
- datahub/ingestion/source_config/pulsar.py +11 -15
- datahub/ingestion/transformer/add_dataset_browse_path.py +1 -1
- datahub/ingestion/transformer/add_dataset_dataproduct.py +6 -5
- datahub/ingestion/transformer/add_dataset_ownership.py +3 -3
- datahub/ingestion/transformer/add_dataset_properties.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_tags.py +2 -2
- datahub/ingestion/transformer/add_dataset_schema_terms.py +2 -2
- datahub/ingestion/transformer/add_dataset_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +1 -1
- datahub/ingestion/transformer/extract_dataset_tags.py +1 -1
- datahub/ingestion/transformer/extract_ownership_from_tags.py +1 -1
- datahub/ingestion/transformer/mark_dataset_status.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_dataset_usage_user.py +1 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +1 -1
- datahub/ingestion/transformer/remove_dataset_ownership.py +1 -1
- datahub/ingestion/transformer/replace_external_url.py +2 -2
- datahub/ingestion/transformer/set_browse_path.py +1 -1
- datahub/ingestion/transformer/tags_to_terms.py +1 -1
- datahub/lite/duckdb_lite.py +1 -1
- datahub/lite/lite_util.py +2 -2
- datahub/metadata/_internal_schema_classes.py +62 -2
- datahub/metadata/com/linkedin/pegasus2avro/assertion/__init__.py +2 -0
- datahub/metadata/schema.avsc +271 -91
- datahub/metadata/schemas/ApplicationProperties.avsc +5 -2
- datahub/metadata/schemas/AssertionInfo.avsc +48 -5
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +8 -4
- datahub/metadata/schemas/ChartInfo.avsc +12 -5
- datahub/metadata/schemas/ContainerProperties.avsc +12 -5
- datahub/metadata/schemas/CorpGroupEditableInfo.avsc +2 -1
- datahub/metadata/schemas/CorpGroupInfo.avsc +7 -3
- datahub/metadata/schemas/CorpUserInfo.avsc +5 -2
- datahub/metadata/schemas/CorpUserSettings.avsc +4 -2
- datahub/metadata/schemas/DashboardInfo.avsc +16 -4
- datahub/metadata/schemas/DataFlowInfo.avsc +11 -5
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +4 -2
- datahub/metadata/schemas/DataJobInfo.avsc +9 -4
- datahub/metadata/schemas/DataPlatformInfo.avsc +3 -1
- datahub/metadata/schemas/DataPlatformInstanceProperties.avsc +5 -2
- datahub/metadata/schemas/DataProductProperties.avsc +5 -2
- datahub/metadata/schemas/DataTypeInfo.avsc +5 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/DatasetProperties.avsc +12 -5
- datahub/metadata/schemas/DomainProperties.avsc +7 -3
- datahub/metadata/schemas/EditableContainerProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDashboardProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataFlowProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDataJobProperties.avsc +2 -1
- datahub/metadata/schemas/EditableDatasetProperties.avsc +2 -1
- datahub/metadata/schemas/EditableERModelRelationshipProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLFeatureTableProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelGroupProperties.avsc +2 -1
- datahub/metadata/schemas/EditableMLModelProperties.avsc +2 -1
- datahub/metadata/schemas/EditableNotebookProperties.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +5 -3
- datahub/metadata/schemas/EntityTypeInfo.avsc +5 -0
- datahub/metadata/schemas/GlobalTags.avsc +3 -2
- datahub/metadata/schemas/GlossaryNodeInfo.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermInfo.avsc +3 -1
- datahub/metadata/schemas/InputFields.avsc +3 -2
- datahub/metadata/schemas/MLFeatureKey.avsc +3 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +3 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +3 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLModelProperties.avsc +4 -2
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +3 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +124 -50
- datahub/metadata/schemas/NotebookInfo.avsc +5 -2
- datahub/metadata/schemas/Ownership.avsc +3 -2
- datahub/metadata/schemas/QuerySubjects.avsc +1 -1
- datahub/metadata/schemas/RoleProperties.avsc +3 -1
- datahub/metadata/schemas/SchemaFieldInfo.avsc +3 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -2
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +15 -4
- datahub/metadata/schemas/TagProperties.avsc +3 -1
- datahub/metadata/schemas/TestInfo.avsc +2 -1
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +2 -0
- datahub/sdk/search_filters.py +68 -40
- datahub/sdk/tag.py +112 -0
- datahub/secret/datahub_secret_store.py +7 -4
- datahub/secret/file_secret_store.py +1 -1
- datahub/sql_parsing/schema_resolver.py +29 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +15 -0
- datahub/sql_parsing/sqlglot_lineage.py +5 -2
- datahub/testing/check_sql_parser_result.py +2 -2
- datahub/utilities/ingest_utils.py +1 -1
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.3.0.1rc9.dist-info → acryl_datahub-1.3.1.1.dist-info}/top_level.txt +0 -0
datahub/sdk/__init__.py
CHANGED
|
@@ -28,6 +28,7 @@ from datahub.sdk.main_client import DataHubClient
|
|
|
28
28
|
from datahub.sdk.mlmodel import MLModel
|
|
29
29
|
from datahub.sdk.mlmodelgroup import MLModelGroup
|
|
30
30
|
from datahub.sdk.search_filters import Filter, FilterDsl
|
|
31
|
+
from datahub.sdk.tag import Tag
|
|
31
32
|
|
|
32
33
|
# We want to print out the warning if people do `from datahub.sdk import X`.
|
|
33
34
|
# But we don't want to print out warnings if they're doing a more direct
|
datahub/sdk/_all_entities.py
CHANGED
|
@@ -9,6 +9,7 @@ from datahub.sdk.dataset import Dataset
|
|
|
9
9
|
from datahub.sdk.entity import Entity
|
|
10
10
|
from datahub.sdk.mlmodel import MLModel
|
|
11
11
|
from datahub.sdk.mlmodelgroup import MLModelGroup
|
|
12
|
+
from datahub.sdk.tag import Tag
|
|
12
13
|
|
|
13
14
|
# Base entity classes that don't have circular dependencies
|
|
14
15
|
# Those that do are imported in the EntityClient where needed
|
|
@@ -22,6 +23,7 @@ ENTITY_CLASSES_LIST: List[Type[Entity]] = [
|
|
|
22
23
|
DataJob,
|
|
23
24
|
Dashboard,
|
|
24
25
|
Chart,
|
|
26
|
+
Tag,
|
|
25
27
|
]
|
|
26
28
|
|
|
27
29
|
# Create the mapping of entity types to classes
|
datahub/sdk/search_filters.py
CHANGED
|
@@ -16,6 +16,7 @@ from typing import (
|
|
|
16
16
|
)
|
|
17
17
|
|
|
18
18
|
import pydantic
|
|
19
|
+
from pydantic import field_validator
|
|
19
20
|
|
|
20
21
|
from datahub.configuration.common import ConfigModel
|
|
21
22
|
from datahub.configuration.pydantic_migration_helpers import (
|
|
@@ -102,7 +103,8 @@ class _EntitySubtypeFilter(_BaseFilter):
|
|
|
102
103
|
description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
|
|
103
104
|
)
|
|
104
105
|
|
|
105
|
-
@
|
|
106
|
+
@field_validator("entity_subtype", mode="before")
|
|
107
|
+
@classmethod
|
|
106
108
|
def validate_entity_subtype(cls, v: str) -> List[str]:
|
|
107
109
|
return [v] if not isinstance(v, list) else v
|
|
108
110
|
|
|
@@ -141,10 +143,13 @@ class _PlatformFilter(_BaseFilter):
|
|
|
141
143
|
platform: List[str]
|
|
142
144
|
# TODO: Add validator to convert string -> list of strings
|
|
143
145
|
|
|
144
|
-
@
|
|
145
|
-
|
|
146
|
+
@field_validator("platform", mode="before")
|
|
147
|
+
@classmethod
|
|
148
|
+
def validate_platform(cls, v):
|
|
146
149
|
# Subtle - we use the constructor instead of the from_string method
|
|
147
150
|
# because coercion is acceptable here.
|
|
151
|
+
if isinstance(v, list):
|
|
152
|
+
return [str(DataPlatformUrn(item)) for item in v]
|
|
148
153
|
return str(DataPlatformUrn(v))
|
|
149
154
|
|
|
150
155
|
def _build_rule(self) -> SearchFilterRule:
|
|
@@ -161,8 +166,11 @@ class _PlatformFilter(_BaseFilter):
|
|
|
161
166
|
class _DomainFilter(_BaseFilter):
|
|
162
167
|
domain: List[str]
|
|
163
168
|
|
|
164
|
-
@
|
|
165
|
-
|
|
169
|
+
@field_validator("domain", mode="before")
|
|
170
|
+
@classmethod
|
|
171
|
+
def validate_domain(cls, v):
|
|
172
|
+
if isinstance(v, list):
|
|
173
|
+
return [str(DomainUrn.from_string(item)) for item in v]
|
|
166
174
|
return str(DomainUrn.from_string(v))
|
|
167
175
|
|
|
168
176
|
def _build_rule(self) -> SearchFilterRule:
|
|
@@ -183,8 +191,11 @@ class _ContainerFilter(_BaseFilter):
|
|
|
183
191
|
description="If true, only entities that are direct descendants of the container will be returned.",
|
|
184
192
|
)
|
|
185
193
|
|
|
186
|
-
@
|
|
187
|
-
|
|
194
|
+
@field_validator("container", mode="before")
|
|
195
|
+
@classmethod
|
|
196
|
+
def validate_container(cls, v):
|
|
197
|
+
if isinstance(v, list):
|
|
198
|
+
return [str(ContainerUrn.from_string(item)) for item in v]
|
|
188
199
|
return str(ContainerUrn.from_string(v))
|
|
189
200
|
|
|
190
201
|
@classmethod
|
|
@@ -249,17 +260,25 @@ class _OwnerFilter(_BaseFilter):
|
|
|
249
260
|
description="The owner to filter on. Should be user or group URNs.",
|
|
250
261
|
)
|
|
251
262
|
|
|
252
|
-
@
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
+
@field_validator("owner", mode="before")
|
|
264
|
+
@classmethod
|
|
265
|
+
def validate_owner(cls, v):
|
|
266
|
+
validated = []
|
|
267
|
+
for owner in v:
|
|
268
|
+
if not owner.startswith("urn:li:"):
|
|
269
|
+
raise ValueError(
|
|
270
|
+
f"Owner must be a valid User or Group URN, got: {owner}"
|
|
271
|
+
)
|
|
272
|
+
_type = guess_entity_type(owner)
|
|
273
|
+
if _type == CorpUserUrn.ENTITY_TYPE:
|
|
274
|
+
validated.append(str(CorpUserUrn.from_string(owner)))
|
|
275
|
+
elif _type == CorpGroupUrn.ENTITY_TYPE:
|
|
276
|
+
validated.append(str(CorpGroupUrn.from_string(owner)))
|
|
277
|
+
else:
|
|
278
|
+
raise ValueError(
|
|
279
|
+
f"Owner must be a valid User or Group URN, got: {owner}"
|
|
280
|
+
)
|
|
281
|
+
return validated
|
|
263
282
|
|
|
264
283
|
def _build_rule(self) -> SearchFilterRule:
|
|
265
284
|
return SearchFilterRule(
|
|
@@ -279,17 +298,21 @@ class _GlossaryTermFilter(_BaseFilter):
|
|
|
279
298
|
description="The glossary term to filter on. Should be glossary term URNs.",
|
|
280
299
|
)
|
|
281
300
|
|
|
282
|
-
@
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
301
|
+
@field_validator("glossary_term", mode="before")
|
|
302
|
+
@classmethod
|
|
303
|
+
def validate_glossary_term(cls, v):
|
|
304
|
+
validated = []
|
|
305
|
+
for term in v:
|
|
306
|
+
if not term.startswith("urn:li:"):
|
|
307
|
+
raise ValueError(f"Glossary term must be a valid URN, got: {term}")
|
|
308
|
+
# Validate that it's a glossary term URN
|
|
309
|
+
_type = guess_entity_type(term)
|
|
310
|
+
if _type != "glossaryTerm":
|
|
311
|
+
raise ValueError(
|
|
312
|
+
f"Glossary term must be a valid glossary term URN, got: {term}"
|
|
313
|
+
)
|
|
314
|
+
validated.append(term)
|
|
315
|
+
return validated
|
|
293
316
|
|
|
294
317
|
def _build_rule(self) -> SearchFilterRule:
|
|
295
318
|
return SearchFilterRule(
|
|
@@ -309,15 +332,19 @@ class _TagFilter(_BaseFilter):
|
|
|
309
332
|
description="The tag to filter on. Should be tag URNs.",
|
|
310
333
|
)
|
|
311
334
|
|
|
312
|
-
@
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
335
|
+
@field_validator("tag", mode="before")
|
|
336
|
+
@classmethod
|
|
337
|
+
def validate_tag(cls, v):
|
|
338
|
+
validated = []
|
|
339
|
+
for tag in v:
|
|
340
|
+
if not tag.startswith("urn:li:"):
|
|
341
|
+
raise ValueError(f"Tag must be a valid URN, got: {tag}")
|
|
342
|
+
# Validate that it's a tag URN
|
|
343
|
+
_type = guess_entity_type(tag)
|
|
344
|
+
if _type != "tag":
|
|
345
|
+
raise ValueError(f"Tag must be a valid tag URN, got: {tag}")
|
|
346
|
+
validated.append(tag)
|
|
347
|
+
return validated
|
|
321
348
|
|
|
322
349
|
def _build_rule(self) -> SearchFilterRule:
|
|
323
350
|
return SearchFilterRule(
|
|
@@ -426,7 +453,8 @@ class _Not(_BaseFilter):
|
|
|
426
453
|
|
|
427
454
|
not_: "Filter" = pydantic.Field(alias="not")
|
|
428
455
|
|
|
429
|
-
@
|
|
456
|
+
@field_validator("not_", mode="after")
|
|
457
|
+
@classmethod
|
|
430
458
|
def validate_not(cls, v: "Filter") -> "Filter":
|
|
431
459
|
inner_filter = v.compile()
|
|
432
460
|
if len(inner_filter) != 1:
|
|
@@ -571,7 +599,7 @@ def load_filters(obj: Any) -> Filter:
|
|
|
571
599
|
if PYDANTIC_VERSION_2:
|
|
572
600
|
return pydantic.TypeAdapter(Filter).validate_python(obj) # type: ignore
|
|
573
601
|
else:
|
|
574
|
-
return pydantic.
|
|
602
|
+
return pydantic.TypeAdapter(Filter).validate_python(obj) # type: ignore
|
|
575
603
|
|
|
576
604
|
|
|
577
605
|
# We need FilterDsl for two reasons:
|
datahub/sdk/tag.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Type
|
|
4
|
+
|
|
5
|
+
from typing_extensions import Self
|
|
6
|
+
|
|
7
|
+
import datahub.metadata.schema_classes as models
|
|
8
|
+
from datahub.metadata.urns import TagUrn, Urn
|
|
9
|
+
from datahub.sdk._shared import (
|
|
10
|
+
HasOwnership,
|
|
11
|
+
OwnersInputType,
|
|
12
|
+
)
|
|
13
|
+
from datahub.sdk.entity import Entity, ExtraAspectsType
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Tag(
|
|
17
|
+
HasOwnership,
|
|
18
|
+
Entity,
|
|
19
|
+
):
|
|
20
|
+
__slots__ = ()
|
|
21
|
+
|
|
22
|
+
@classmethod
|
|
23
|
+
def get_urn_type(cls) -> Type[TagUrn]:
|
|
24
|
+
return TagUrn
|
|
25
|
+
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
*,
|
|
29
|
+
# Identity.
|
|
30
|
+
name: str,
|
|
31
|
+
# Tag properties.
|
|
32
|
+
display_name: Optional[str] = None,
|
|
33
|
+
description: Optional[str] = None,
|
|
34
|
+
color: Optional[str] = None,
|
|
35
|
+
# Standard aspects.
|
|
36
|
+
owners: Optional[OwnersInputType] = None,
|
|
37
|
+
extra_aspects: ExtraAspectsType = None,
|
|
38
|
+
):
|
|
39
|
+
"""Initialize a new Tag instance."""
|
|
40
|
+
urn = TagUrn(name=name)
|
|
41
|
+
super().__init__(urn)
|
|
42
|
+
self._set_extra_aspects(extra_aspects)
|
|
43
|
+
|
|
44
|
+
self._ensure_tag_props(
|
|
45
|
+
display_name=display_name or name,
|
|
46
|
+
description=description,
|
|
47
|
+
color=color,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
if owners is not None:
|
|
51
|
+
self.set_owners(owners)
|
|
52
|
+
|
|
53
|
+
@classmethod
|
|
54
|
+
def _new_from_graph(cls, urn: Urn, current_aspects: models.AspectBag) -> Self:
|
|
55
|
+
assert isinstance(urn, TagUrn)
|
|
56
|
+
entity = cls(name=urn.name)
|
|
57
|
+
return entity._init_from_graph(current_aspects)
|
|
58
|
+
|
|
59
|
+
@property
|
|
60
|
+
def urn(self) -> TagUrn:
|
|
61
|
+
assert isinstance(self._urn, TagUrn)
|
|
62
|
+
return self._urn
|
|
63
|
+
|
|
64
|
+
def _ensure_tag_props(
|
|
65
|
+
self,
|
|
66
|
+
*,
|
|
67
|
+
display_name: Optional[str] = None,
|
|
68
|
+
description: Optional[str] = None,
|
|
69
|
+
color: Optional[str] = None,
|
|
70
|
+
) -> models.TagPropertiesClass:
|
|
71
|
+
existing_props = self._get_aspect(models.TagPropertiesClass)
|
|
72
|
+
if existing_props is not None:
|
|
73
|
+
if display_name is not None:
|
|
74
|
+
existing_props.name = display_name
|
|
75
|
+
if description is not None:
|
|
76
|
+
existing_props.description = description
|
|
77
|
+
if color is not None:
|
|
78
|
+
existing_props.colorHex = color
|
|
79
|
+
return existing_props
|
|
80
|
+
|
|
81
|
+
return self._setdefault_aspect(
|
|
82
|
+
models.TagPropertiesClass(
|
|
83
|
+
name=display_name or self.urn.name,
|
|
84
|
+
description=description,
|
|
85
|
+
colorHex=color,
|
|
86
|
+
)
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def name(self) -> str:
|
|
91
|
+
return self.urn.name
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def display_name(self) -> str:
|
|
95
|
+
return self._ensure_tag_props().name
|
|
96
|
+
|
|
97
|
+
def set_display_name(self, display_name: str) -> None:
|
|
98
|
+
self._ensure_tag_props(display_name=display_name)
|
|
99
|
+
|
|
100
|
+
@property
|
|
101
|
+
def description(self) -> Optional[str]:
|
|
102
|
+
return self._ensure_tag_props().description
|
|
103
|
+
|
|
104
|
+
def set_description(self, description: str) -> None:
|
|
105
|
+
self._ensure_tag_props(description=description)
|
|
106
|
+
|
|
107
|
+
@property
|
|
108
|
+
def color(self) -> Optional[str]:
|
|
109
|
+
return self._ensure_tag_props().colorHex
|
|
110
|
+
|
|
111
|
+
def set_color(self, color: str) -> None:
|
|
112
|
+
self._ensure_tag_props(color=color)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Any, Dict, List, Optional, Union
|
|
3
3
|
|
|
4
|
-
from pydantic import BaseModel,
|
|
4
|
+
from pydantic import BaseModel, field_validator
|
|
5
5
|
|
|
6
6
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
7
7
|
from datahub.ingestion.graph.config import DatahubClientConfig
|
|
@@ -18,8 +18,11 @@ class DataHubSecretStoreConfig(BaseModel):
|
|
|
18
18
|
class Config:
|
|
19
19
|
arbitrary_types_allowed = True
|
|
20
20
|
|
|
21
|
-
@
|
|
22
|
-
|
|
21
|
+
@field_validator("graph_client", mode="after")
|
|
22
|
+
@classmethod
|
|
23
|
+
def check_graph_connection(
|
|
24
|
+
cls, v: Optional[DataHubGraph]
|
|
25
|
+
) -> Optional[DataHubGraph]:
|
|
23
26
|
if v is not None:
|
|
24
27
|
v.test_connection()
|
|
25
28
|
return v
|
|
@@ -63,7 +66,7 @@ class DataHubSecretStore(SecretStore):
|
|
|
63
66
|
|
|
64
67
|
@classmethod
|
|
65
68
|
def create(cls, config: Any) -> "DataHubSecretStore":
|
|
66
|
-
config = DataHubSecretStoreConfig.
|
|
69
|
+
config = DataHubSecretStoreConfig.model_validate(config)
|
|
67
70
|
return cls(config)
|
|
68
71
|
|
|
69
72
|
def close(self) -> None:
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import contextlib
|
|
2
2
|
import pathlib
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
from typing import Dict, List, Optional, Protocol, Set, Tuple
|
|
4
5
|
|
|
5
6
|
from typing_extensions import TypedDict
|
|
@@ -22,6 +23,14 @@ from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_fie
|
|
|
22
23
|
SchemaInfo = Dict[str, str]
|
|
23
24
|
|
|
24
25
|
|
|
26
|
+
@dataclass
|
|
27
|
+
class SchemaResolverReport:
|
|
28
|
+
"""Report class for tracking SchemaResolver cache performance."""
|
|
29
|
+
|
|
30
|
+
num_schema_cache_hits: int = 0
|
|
31
|
+
num_schema_cache_misses: int = 0
|
|
32
|
+
|
|
33
|
+
|
|
25
34
|
class GraphQLSchemaField(TypedDict):
|
|
26
35
|
fieldPath: str
|
|
27
36
|
nativeDataType: str
|
|
@@ -53,6 +62,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
53
62
|
env: str = DEFAULT_ENV,
|
|
54
63
|
graph: Optional[DataHubGraph] = None,
|
|
55
64
|
_cache_filename: Optional[pathlib.Path] = None,
|
|
65
|
+
report: Optional[SchemaResolverReport] = None,
|
|
56
66
|
):
|
|
57
67
|
# Also supports platform with an urn prefix.
|
|
58
68
|
self._platform = DataPlatformUrn(platform).platform_name
|
|
@@ -60,6 +70,7 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
60
70
|
self.env = env
|
|
61
71
|
|
|
62
72
|
self.graph = graph
|
|
73
|
+
self.report = report
|
|
63
74
|
|
|
64
75
|
# Init cache, potentially restoring from a previous run.
|
|
65
76
|
shared_conn = None
|
|
@@ -132,12 +143,14 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
132
143
|
|
|
133
144
|
schema_info = self._resolve_schema_info(urn)
|
|
134
145
|
if schema_info:
|
|
146
|
+
self._track_cache_hit()
|
|
135
147
|
return urn, schema_info
|
|
136
148
|
|
|
137
149
|
urn_lower = self.get_urn_for_table(table, lower=True)
|
|
138
150
|
if urn_lower != urn:
|
|
139
151
|
schema_info = self._resolve_schema_info(urn_lower)
|
|
140
152
|
if schema_info:
|
|
153
|
+
self._track_cache_hit()
|
|
141
154
|
return urn_lower, schema_info
|
|
142
155
|
|
|
143
156
|
# Our treatment of platform instances when lowercasing urns
|
|
@@ -152,8 +165,12 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
152
165
|
if urn_mixed not in {urn, urn_lower}:
|
|
153
166
|
schema_info = self._resolve_schema_info(urn_mixed)
|
|
154
167
|
if schema_info:
|
|
168
|
+
self._track_cache_hit()
|
|
155
169
|
return urn_mixed, schema_info
|
|
156
170
|
|
|
171
|
+
# Track cache miss for the final attempt
|
|
172
|
+
self._track_cache_miss()
|
|
173
|
+
|
|
157
174
|
if self._prefers_urn_lower():
|
|
158
175
|
return urn_lower, None
|
|
159
176
|
else:
|
|
@@ -165,6 +182,16 @@ class SchemaResolver(Closeable, SchemaResolverInterface):
|
|
|
165
182
|
def has_urn(self, urn: str) -> bool:
|
|
166
183
|
return self._schema_cache.get(urn) is not None
|
|
167
184
|
|
|
185
|
+
def _track_cache_hit(self) -> None:
|
|
186
|
+
"""Track a cache hit if reporting is enabled."""
|
|
187
|
+
if self.report is not None:
|
|
188
|
+
self.report.num_schema_cache_hits += 1
|
|
189
|
+
|
|
190
|
+
def _track_cache_miss(self) -> None:
|
|
191
|
+
"""Track a cache miss if reporting is enabled."""
|
|
192
|
+
if self.report is not None:
|
|
193
|
+
self.report.num_schema_cache_misses += 1
|
|
194
|
+
|
|
168
195
|
def _resolve_schema_info(self, urn: str) -> Optional[SchemaInfo]:
|
|
169
196
|
if urn in self._schema_cache:
|
|
170
197
|
return self._schema_cache[urn]
|
|
@@ -261,6 +288,8 @@ class _SchemaResolverWithExtras(SchemaResolverInterface):
|
|
|
261
288
|
table, lower=self._base_resolver._prefers_urn_lower()
|
|
262
289
|
)
|
|
263
290
|
if urn in self._extra_schemas:
|
|
291
|
+
# Track cache hit for extra schemas
|
|
292
|
+
self._base_resolver._track_cache_hit()
|
|
264
293
|
return urn, self._extra_schemas[urn]
|
|
265
294
|
return self._base_resolver.resolve_table(table)
|
|
266
295
|
|
|
@@ -168,6 +168,12 @@ class QueryMetadata:
|
|
|
168
168
|
query_subject_urns.add(upstream)
|
|
169
169
|
if include_fields:
|
|
170
170
|
for column in sorted(self.column_usage.get(upstream, [])):
|
|
171
|
+
# Skip empty column names to avoid creating invalid URNs
|
|
172
|
+
if not column or not column.strip():
|
|
173
|
+
logger.warning(
|
|
174
|
+
f"Skipping empty upstream column name for query {self.query_id} on upstream {upstream}"
|
|
175
|
+
)
|
|
176
|
+
continue
|
|
171
177
|
query_subject_urns.add(
|
|
172
178
|
builder.make_schema_field_urn(upstream, column)
|
|
173
179
|
)
|
|
@@ -175,6 +181,15 @@ class QueryMetadata:
|
|
|
175
181
|
query_subject_urns.add(downstream_urn)
|
|
176
182
|
if include_fields:
|
|
177
183
|
for column_lineage in self.column_lineage:
|
|
184
|
+
# Skip empty downstream columns to avoid creating invalid URNs
|
|
185
|
+
if (
|
|
186
|
+
not column_lineage.downstream.column
|
|
187
|
+
or not column_lineage.downstream.column.strip()
|
|
188
|
+
):
|
|
189
|
+
logger.warning(
|
|
190
|
+
f"Skipping empty downstream column name for query {self.query_id} on downstream {downstream_urn}"
|
|
191
|
+
)
|
|
192
|
+
continue
|
|
178
193
|
query_subject_urns.add(
|
|
179
194
|
builder.make_schema_field_urn(
|
|
180
195
|
downstream_urn, column_lineage.downstream.column
|
|
@@ -28,6 +28,7 @@ import sqlglot.optimizer.optimizer
|
|
|
28
28
|
import sqlglot.optimizer.qualify
|
|
29
29
|
import sqlglot.optimizer.qualify_columns
|
|
30
30
|
import sqlglot.optimizer.unnest_subqueries
|
|
31
|
+
from pydantic import field_validator
|
|
31
32
|
|
|
32
33
|
from datahub.cli.env_utils import get_boolean_env_variable
|
|
33
34
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
@@ -141,7 +142,8 @@ class DownstreamColumnRef(_ParserBaseModel):
|
|
|
141
142
|
column_type: Optional[SchemaFieldDataTypeClass] = None
|
|
142
143
|
native_column_type: Optional[str] = None
|
|
143
144
|
|
|
144
|
-
@
|
|
145
|
+
@field_validator("column_type", mode="before")
|
|
146
|
+
@classmethod
|
|
145
147
|
def _load_column_type(
|
|
146
148
|
cls, v: Optional[Union[dict, SchemaFieldDataTypeClass]]
|
|
147
149
|
) -> Optional[SchemaFieldDataTypeClass]:
|
|
@@ -215,7 +217,8 @@ class SqlParsingDebugInfo(_ParserBaseModel):
|
|
|
215
217
|
def error(self) -> Optional[Exception]:
|
|
216
218
|
return self.table_error or self.column_error
|
|
217
219
|
|
|
218
|
-
@
|
|
220
|
+
@field_validator("table_error", "column_error", mode="before")
|
|
221
|
+
@classmethod
|
|
219
222
|
def remove_variables_from_error(cls, v: Optional[Exception]) -> Optional[Exception]:
|
|
220
223
|
if v and v.__traceback__:
|
|
221
224
|
# Remove local variables from the traceback to avoid memory leaks.
|
|
@@ -60,8 +60,8 @@ def assert_sql_result_with_resolver(
|
|
|
60
60
|
expected = SqlParsingResult.parse_raw(expected_file.read_text())
|
|
61
61
|
|
|
62
62
|
full_diff = deepdiff.DeepDiff(
|
|
63
|
-
expected.
|
|
64
|
-
res.
|
|
63
|
+
expected.model_dump(),
|
|
64
|
+
res.model_dump(),
|
|
65
65
|
exclude_regex_paths=[
|
|
66
66
|
r"root.column_lineage\[\d+\].logic",
|
|
67
67
|
],
|
|
@@ -48,7 +48,7 @@ def deploy_source_vars(
|
|
|
48
48
|
|
|
49
49
|
deploy_options_raw = pipeline_config.pop("deployment", None)
|
|
50
50
|
if deploy_options_raw is not None:
|
|
51
|
-
deploy_options = DeployOptions.
|
|
51
|
+
deploy_options = DeployOptions.model_validate(deploy_options_raw)
|
|
52
52
|
|
|
53
53
|
if name:
|
|
54
54
|
logger.info(f"Overriding deployment name {deploy_options.name} with {name}")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|