acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/entities/common/serialized_value.py +4 -3
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/check_cli.py +72 -19
- datahub/cli/docker_cli.py +3 -3
- datahub/cli/iceberg_cli.py +31 -7
- datahub/cli/ingest_cli.py +30 -93
- datahub/cli/lite_cli.py +4 -2
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/configuration/common.py +10 -2
- datahub/configuration/git.py +1 -3
- datahub/configuration/kafka.py +1 -1
- datahub/emitter/mce_builder.py +28 -13
- datahub/emitter/mcp_builder.py +4 -1
- datahub/emitter/response_helper.py +145 -0
- datahub/emitter/rest_emitter.py +323 -10
- datahub/ingestion/api/decorators.py +1 -1
- datahub/ingestion/api/source_helpers.py +4 -0
- datahub/ingestion/fs/s3_fs.py +2 -2
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +41 -22
- datahub/ingestion/graph/entity_versioning.py +3 -3
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
- datahub/ingestion/run/pipeline.py +112 -148
- datahub/ingestion/run/sink_callback.py +77 -0
- datahub/ingestion/sink/datahub_rest.py +8 -0
- datahub/ingestion/source/abs/config.py +2 -4
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
- datahub/ingestion/source/cassandra/cassandra.py +152 -233
- datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
- datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
- datahub/ingestion/source/common/subtypes.py +12 -0
- datahub/ingestion/source/csv_enricher.py +3 -3
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +8 -5
- datahub/ingestion/source/dbt/dbt_core.py +11 -9
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/delta_lake/config.py +8 -1
- datahub/ingestion/source/delta_lake/report.py +4 -2
- datahub/ingestion/source/delta_lake/source.py +20 -5
- datahub/ingestion/source/dremio/dremio_api.py +4 -8
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
- datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
- datahub/ingestion/source/elastic_search.py +26 -6
- datahub/ingestion/source/feast.py +27 -8
- datahub/ingestion/source/file.py +6 -3
- datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
- datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
- datahub/ingestion/source/ge_data_profiler.py +12 -15
- datahub/ingestion/source/iceberg/iceberg.py +46 -12
- datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
- datahub/ingestion/source/identity/okta.py +37 -7
- datahub/ingestion/source/kafka/kafka.py +1 -1
- datahub/ingestion/source/kafka_connect/common.py +2 -7
- datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
- datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
- datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
- datahub/ingestion/source/looker/looker_common.py +6 -5
- datahub/ingestion/source/looker/looker_file_loader.py +2 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
- datahub/ingestion/source/looker/looker_source.py +1 -1
- datahub/ingestion/source/looker/looker_template_language.py +4 -2
- datahub/ingestion/source/looker/lookml_source.py +3 -2
- datahub/ingestion/source/metabase.py +57 -35
- datahub/ingestion/source/metadata/business_glossary.py +45 -3
- datahub/ingestion/source/metadata/lineage.py +2 -2
- datahub/ingestion/source/mlflow.py +365 -35
- datahub/ingestion/source/mode.py +18 -8
- datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
- datahub/ingestion/source/nifi.py +37 -11
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/openapi_parser.py +49 -17
- datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
- datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -3
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
- datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/pulsar.py +3 -2
- datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
- datahub/ingestion/source/redash.py +31 -7
- datahub/ingestion/source/redshift/config.py +4 -0
- datahub/ingestion/source/redshift/datashares.py +236 -0
- datahub/ingestion/source/redshift/lineage.py +6 -2
- datahub/ingestion/source/redshift/lineage_v2.py +24 -9
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +133 -33
- datahub/ingestion/source/redshift/redshift.py +46 -73
- datahub/ingestion/source/redshift/redshift_schema.py +186 -6
- datahub/ingestion/source/redshift/report.py +3 -0
- datahub/ingestion/source/s3/config.py +5 -5
- datahub/ingestion/source/s3/source.py +20 -41
- datahub/ingestion/source/salesforce.py +550 -275
- datahub/ingestion/source/schema_inference/object.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +1 -1
- datahub/ingestion/source/slack/slack.py +31 -10
- datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
- datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
- datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
- datahub/ingestion/source/sql/athena.py +10 -16
- datahub/ingestion/source/sql/druid.py +1 -5
- datahub/ingestion/source/sql/hive.py +15 -6
- datahub/ingestion/source/sql/hive_metastore.py +3 -2
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +11 -5
- datahub/ingestion/source/sql/oracle.py +127 -63
- datahub/ingestion/source/sql/sql_common.py +16 -18
- datahub/ingestion/source/sql/sql_types.py +2 -2
- datahub/ingestion/source/sql/teradata.py +19 -5
- datahub/ingestion/source/sql/trino.py +2 -2
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +222 -62
- datahub/ingestion/source/tableau/tableau.py +22 -6
- datahub/ingestion/source/tableau/tableau_common.py +3 -2
- datahub/ingestion/source/unity/ge_profiler.py +2 -1
- datahub/ingestion/source/unity/source.py +11 -1
- datahub/ingestion/source/vertexai.py +697 -0
- datahub/ingestion/source_config/pulsar.py +3 -1
- datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
- datahub/lite/duckdb_lite.py +3 -10
- datahub/lite/lite_local.py +1 -1
- datahub/lite/lite_util.py +4 -3
- datahub/metadata/_schema_classes.py +714 -417
- datahub/metadata/_urns/urn_defs.py +1673 -1649
- datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
- datahub/metadata/schema.avsc +16438 -16603
- datahub/metadata/schemas/AssertionInfo.avsc +3 -1
- datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
- datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
- datahub/metadata/schemas/ChartInfo.avsc +1 -0
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/DomainKey.avsc +2 -1
- datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
- datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
- datahub/metadata/schemas/IncidentInfo.avsc +130 -46
- datahub/metadata/schemas/InputFields.avsc +3 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
- datahub/metadata/schemas/MLModelKey.avsc +3 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +30 -12
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_attribution.py +4 -0
- datahub/sdk/_shared.py +258 -16
- datahub/sdk/_utils.py +35 -0
- datahub/sdk/container.py +30 -6
- datahub/sdk/dataset.py +118 -20
- datahub/sdk/{_entity.py → entity.py} +24 -1
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +23 -0
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- datahub/specific/dataset.py +3 -4
- datahub/sql_parsing/_sqlglot_patch.py +2 -10
- datahub/sql_parsing/schema_resolver.py +1 -1
- datahub/sql_parsing/split_statements.py +220 -126
- datahub/sql_parsing/sql_parsing_common.py +7 -0
- datahub/sql_parsing/sqlglot_lineage.py +1 -1
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- datahub/testing/check_sql_parser_result.py +5 -6
- datahub/testing/compare_metadata_json.py +7 -6
- datahub/testing/pytest_hooks.py +56 -0
- datahub/upgrade/upgrade.py +2 -2
- datahub/utilities/file_backed_collections.py +3 -14
- datahub/utilities/ingest_utils.py +106 -0
- datahub/utilities/mapping.py +1 -1
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/sentinels.py +22 -0
- datahub/utilities/unified_diff.py +5 -1
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
- {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
datahub/sdk/search_filters.py
ADDED

@@ -0,0 +1,374 @@
from __future__ import annotations

import abc
from typing import (
    Any,
    List,
    Sequence,
    TypedDict,
    Union,
)

import pydantic

from datahub.configuration.common import ConfigModel
from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
from datahub.ingestion.graph.client import entity_type_to_graphql
from datahub.ingestion.graph.filters import SearchFilterRule
from datahub.metadata.schema_classes import EntityTypeName
from datahub.metadata.urns import DataPlatformUrn, DomainUrn

_AndSearchFilterRule = TypedDict(
    "_AndSearchFilterRule", {"and": List[SearchFilterRule]}
)
_OrFilters = List[_AndSearchFilterRule]


class _BaseFilter(ConfigModel):
    class Config:
        # We can't wrap this in a TYPE_CHECKING block because the pydantic plugin
        # doesn't recognize it properly. So unfortunately we'll need to live
        # with the deprecation warning w/ pydantic v2.
        allow_population_by_field_name = True
        if PYDANTIC_VERSION_2:
            populate_by_name = True

    @abc.abstractmethod
    def compile(self) -> _OrFilters:
        pass


def _flexible_entity_type_to_graphql(entity_type: str) -> str:
    if entity_type.upper() == entity_type:
        # Assume that we were passed a graphql EntityType enum value,
        # so no conversion is needed.
        return entity_type
    return entity_type_to_graphql(entity_type)


class _EntityTypeFilter(_BaseFilter):
    entity_type: List[str] = pydantic.Field(
        description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
    )

    def _build_rule(self) -> SearchFilterRule:
        return SearchFilterRule(
            field="_entityType",
            condition="EQUAL",
            values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
        )

    def compile(self) -> _OrFilters:
        return [{"and": [self._build_rule()]}]


class _EntitySubtypeFilter(_BaseFilter):
    entity_type: str
    entity_subtype: str = pydantic.Field(
        description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
    )

    def compile(self) -> _OrFilters:
        rules = [
            SearchFilterRule(
                field="_entityType",
                condition="EQUAL",
                values=[_flexible_entity_type_to_graphql(self.entity_type)],
            ),
            SearchFilterRule(
                field="typeNames",
                condition="EQUAL",
                values=[self.entity_subtype],
            ),
        ]
        return [{"and": rules}]


class _PlatformFilter(_BaseFilter):
    platform: List[str]
    # TODO: Add validator to convert string -> list of strings

    @pydantic.validator("platform", each_item=True)
    def validate_platform(cls, v: str) -> str:
        # Subtle - we use the constructor instead of the from_string method
        # because coercion is acceptable here.
        return str(DataPlatformUrn(v))

    def _build_rule(self) -> SearchFilterRule:
        return SearchFilterRule(
            field="platform.keyword",
            condition="EQUAL",
            values=self.platform,
        )

    def compile(self) -> _OrFilters:
        return [{"and": [self._build_rule()]}]


class _DomainFilter(_BaseFilter):
    domain: List[str]

    @pydantic.validator("domain", each_item=True)
    def validate_domain(cls, v: str) -> str:
        return str(DomainUrn.from_string(v))

    def _build_rule(self) -> SearchFilterRule:
        return SearchFilterRule(
            field="domains",
            condition="EQUAL",
            values=self.domain,
        )

    def compile(self) -> _OrFilters:
        return [{"and": [self._build_rule()]}]


class _EnvFilter(_BaseFilter):
    # Note that not all entity types have an env (e.g. dashboards / charts).
    # If the env filter is specified, these will be excluded.
    env: List[str]

    def compile(self) -> _OrFilters:
        return [
            # For most entity types, we look at the origin field.
            {
                "and": [
                    SearchFilterRule(
                        field="origin",
                        condition="EQUAL",
                        values=self.env,
                    ),
                ]
            },
            # For containers, we now have an "env" property as of
            # https://github.com/datahub-project/datahub/pull/11214
            # Prior to this, we put "env" in the customProperties. But we're
            # not bothering with that here.
            {
                "and": [
                    SearchFilterRule(
                        field="env",
                        condition="EQUAL",
                        values=self.env,
                    ),
                ]
            },
        ]


class _CustomCondition(_BaseFilter):
    """Represents a single field condition"""

    field: str
    condition: str
    values: List[str]

    def compile(self) -> _OrFilters:
        rule = SearchFilterRule(
            field=self.field,
            condition=self.condition,
            values=self.values,
        )
        return [{"and": [rule]}]


class _And(_BaseFilter):
    """Represents an AND conjunction of filters"""

    and_: Sequence["Filter"] = pydantic.Field(alias="and")
    # TODO: Add validator to ensure that the "and" field is not empty

    def compile(self) -> _OrFilters:
        # The "and" operator must be implemented by doing a Cartesian product
        # of the OR clauses.
        # Example 1:
        # (A or B) and (C or D) ->
        # (A and C) or (A and D) or (B and C) or (B and D)
        # Example 2:
        # (A or B) and (C or D) and (E or F) ->
        # (A and C and E) or (A and C and F) or (A and D and E) or (A and D and F) or
        # (B and C and E) or (B and C and F) or (B and D and E) or (B and D and F)

        # Start with the first filter's OR clauses
        result = self.and_[0].compile()

        # For each subsequent filter
        for filter in self.and_[1:]:
            new_result = []
            # Get its OR clauses
            other_clauses = filter.compile()

            # Create Cartesian product
            for existing_clause in result:
                for other_clause in other_clauses:
                    # Merge the AND conditions from both clauses
                    new_result.append(self._merge_ands(existing_clause, other_clause))

            result = new_result

        return result

    @classmethod
    def _merge_ands(
        cls, a: _AndSearchFilterRule, b: _AndSearchFilterRule
    ) -> _AndSearchFilterRule:
        return {
            "and": [
                *a["and"],
                *b["and"],
            ]
        }


class _Or(_BaseFilter):
    """Represents an OR conjunction of filters"""

    or_: Sequence["Filter"] = pydantic.Field(alias="or")
    # TODO: Add validator to ensure that the "or" field is not empty

    def compile(self) -> _OrFilters:
        merged_filter = []
        for filter in self.or_:
            merged_filter.extend(filter.compile())
        return merged_filter


class _Not(_BaseFilter):
    """Represents a NOT filter"""

    not_: "Filter" = pydantic.Field(alias="not")

    @pydantic.validator("not_", pre=False)
    def validate_not(cls, v: "Filter") -> "Filter":
        inner_filter = v.compile()
        if len(inner_filter) != 1:
            raise ValueError(
                "Cannot negate a filter with multiple OR clauses [not yet supported]"
            )
        return v

    def compile(self) -> _OrFilters:
        # TODO: Eventually we'll want to implement a full DNF normalizer.
        # https://en.wikipedia.org/wiki/Disjunctive_normal_form#Conversion_to_DNF

        inner_filter = self.not_.compile()
        assert len(inner_filter) == 1  # validated above

        # ¬(A and B) -> (¬A) OR (¬B)
        and_filters = inner_filter[0]["and"]
        final_filters: _OrFilters = []
        for rule in and_filters:
            final_filters.append({"and": [rule.negate()]})

        return final_filters


# TODO: With pydantic 2, we can use a RootModel with a
# discriminated union to make the error messages more informative.
Filter = Union[
    _And,
    _Or,
    _Not,
    _EntityTypeFilter,
    _EntitySubtypeFilter,
    _PlatformFilter,
    _DomainFilter,
    _EnvFilter,
    _CustomCondition,
]


# Required to resolve forward references to "Filter"
if PYDANTIC_VERSION_2:
    _And.model_rebuild()  # type: ignore
    _Or.model_rebuild()  # type: ignore
    _Not.model_rebuild()  # type: ignore
else:
    _And.update_forward_refs()
    _Or.update_forward_refs()
    _Not.update_forward_refs()


def load_filters(obj: Any) -> Filter:
    if PYDANTIC_VERSION_2:
        return pydantic.TypeAdapter(Filter).validate_python(obj)  # type: ignore
    else:
        return pydantic.parse_obj_as(Filter, obj)  # type: ignore


# We need FilterDsl for two reasons:
# 1. To provide wrapper methods around lots of filters while avoid bloating the
#    yaml spec.
# 2. Pydantic models in general don't support positional arguments, making the
#    calls feel repetitive (e.g. Platform(platform=...)).
#    See https://github.com/pydantic/pydantic/issues/6792
# We also considered using dataclasses / pydantic dataclasses, but
# ultimately decided that they didn't quite suit our requirements,
# particularly with regards to the field aliases for and/or/not.
class FilterDsl:
    @staticmethod
    def and_(*args: "Filter") -> _And:
        return _And(and_=list(args))

    @staticmethod
    def or_(*args: "Filter") -> _Or:
        return _Or(or_=list(args))

    @staticmethod
    def not_(arg: "Filter") -> _Not:
        return _Not(not_=arg)

    @staticmethod
    def entity_type(
        entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
    ) -> _EntityTypeFilter:
        return _EntityTypeFilter(
            entity_type=(
                [entity_type] if isinstance(entity_type, str) else list(entity_type)
            )
        )

    @staticmethod
    def entity_subtype(entity_type: str, subtype: str) -> _EntitySubtypeFilter:
        return _EntitySubtypeFilter(
            entity_type=entity_type,
            entity_subtype=subtype,
        )

    @staticmethod
    def platform(platform: Union[str, List[str]], /) -> _PlatformFilter:
        return _PlatformFilter(
            platform=[platform] if isinstance(platform, str) else platform
        )

    # TODO: Add a platform_instance filter

    @staticmethod
    def domain(domain: Union[str, List[str]], /) -> _DomainFilter:
        return _DomainFilter(domain=[domain] if isinstance(domain, str) else domain)

    @staticmethod
    def env(env: Union[str, List[str]], /) -> _EnvFilter:
        return _EnvFilter(env=[env] if isinstance(env, str) else env)

    @staticmethod
    def has_custom_property(key: str, value: str) -> _CustomCondition:
        return _CustomCondition(
            field="customProperties",
            condition="EQUAL",
            values=[f"{key}={value}"],
        )

    # TODO: Add a soft-deletion status filter
    # TODO: add a container / browse path filter
    # TODO add shortcut for custom filters

    @staticmethod
    def custom_filter(
        field: str, condition: str, values: List[str]
    ) -> _CustomCondition:
        return _CustomCondition(
            field=field,
            condition=condition,
            values=values,
        )
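For orientation, a minimal usage sketch of the new filter DSL (illustrative, not part of the diff; it assumes only the names defined in search_filters.py above):

from datahub.sdk.search_filters import FilterDsl as F, load_filters

# (platform=snowflake OR platform=bigquery) AND entity_type=dataset
f = F.and_(
    F.or_(F.platform("snowflake"), F.platform("bigquery")),
    F.entity_type("dataset"),
)

# _And.compile() takes the Cartesian product of its children's OR clauses,
# so this compiles to two {"and": [...]} alternatives:
#   (platform=snowflake AND type=DATASET) OR (platform=bigquery AND type=DATASET)
filters = f.compile()

# The same filter can be loaded from a plain dict, e.g. parsed from YAML,
# since each filter model accepts its field name (or alias) as the key:
f2 = load_filters(
    {
        "and": [
            {"or": [{"platform": ["snowflake"]}, {"platform": ["bigquery"]}]},
            {"entity_type": ["dataset"]},
        ]
    }
)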
datahub/specific/dataset.py
CHANGED

@@ -15,6 +15,7 @@ from datahub.metadata.schema_classes import (
     UpstreamClass as Upstream,
     UpstreamLineageClass as UpstreamLineage,
 )
+from datahub.metadata.urns import DatasetUrn, TagUrn, Urn
 from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
 from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
 from datahub.specific.aspect_helpers.structured_properties import (
@@ -22,8 +23,6 @@ from datahub.specific.aspect_helpers.structured_properties import (
 )
 from datahub.specific.aspect_helpers.tags import HasTagsPatch
 from datahub.specific.aspect_helpers.terms import HasTermsPatch
-from datahub.utilities.urns.tag_urn import TagUrn
-from datahub.utilities.urns.urn import Urn

 _Parent = TypeVar("_Parent", bound=MetadataPatchProposal)

@@ -104,12 +103,12 @@ class DatasetPatchBuilder(
 ):
     def __init__(
         self,
-        urn: str,
+        urn: Union[str, DatasetUrn],
         system_metadata: Optional[SystemMetadataClass] = None,
         audit_header: Optional[KafkaAuditHeaderClass] = None,
     ) -> None:
         super().__init__(
-            urn, system_metadata=system_metadata, audit_header=audit_header
+            str(urn), system_metadata=system_metadata, audit_header=audit_header
         )

     @classmethod
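A small illustration of the API effect of this change (not from the diff; the DatasetUrn constructor arguments are assumed from the urns module):

from datahub.metadata.urns import DatasetUrn
from datahub.specific.dataset import DatasetPatchBuilder

# Previously the builder required a raw urn string; it now also accepts a
# typed DatasetUrn and stringifies it internally via str(urn).
urn = DatasetUrn(platform="snowflake", name="db.schema.table", env="PROD")
patch = DatasetPatchBuilder(urn)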
datahub/sql_parsing/_sqlglot_patch.py
CHANGED

(Note: this file embeds patch text as string literals, so the removed and
added lines below themselves contain diff markers.)

@@ -172,17 +172,9 @@ def _patch_lineage() -> None:
         derived_tables = [
             source.expression.parent
             for source in scope.sources.values()
-@@ -254,6 +257,7 @@ def to_node(
-             if dt.comments and dt.comments[0].startswith("source: ")
-         }
-
-+        c: exp.Column
-         for c in source_columns:
-             table = c.table
-             source = scope.sources.get(table)
 @@ -281,8 +285,21 @@ def to_node(
-         #
-         #
+         # is unknown. This can happen if the definition of a source used in a query is not
+         # passed into the `sources` map.
          source = source or exp.Placeholder()
 +
 +        subfields = []
datahub/sql_parsing/schema_resolver.py
CHANGED

@@ -13,7 +13,7 @@ from datahub.ingestion.graph.client import DataHubGraph
 from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
 from datahub.metadata.schema_classes import SchemaFieldClass, SchemaMetadataClass
 from datahub.metadata.urns import DataPlatformUrn
-from datahub.sql_parsing._models import _TableName
+from datahub.sql_parsing._models import _TableName as _TableName
 from datahub.sql_parsing.sql_parsing_common import PLATFORMS_WITH_CASE_SENSITIVE_TABLES
 from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict
 from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path