acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/sdk/search_client.py
CHANGED
@@ -2,36 +2,106 @@ from __future__ import annotations

 from typing import (
     TYPE_CHECKING,
-    Dict,
     Iterable,
     List,
     Optional,
+    Tuple,
+    Type,
+    TypeVar,
 )

-from datahub.ingestion.graph.filters import
+from datahub.ingestion.graph.filters import RawSearchFilter, RemovedStatusFilter
 from datahub.metadata.urns import Urn
-from datahub.sdk.search_filters import
+from datahub.sdk.search_filters import (
+    Filter,
+    FilterDsl,
+    _EntityTypeFilter,
+    _OrFilters,
+    _StatusFilter,
+)

 if TYPE_CHECKING:
     from datahub.sdk.main_client import DataHubClient


+_FilterType = TypeVar("_FilterType", bound=Filter)
+
+
+def _typed_dfs(
+    filter: Optional[_FilterType], type: Type[_FilterType]
+) -> Optional[List[_FilterType]]:
+    if filter is None:
+        return None
+
+    found: Optional[List[_FilterType]] = None
+    for f in filter.dfs():
+        if isinstance(f, type):
+            if found is None:
+                found = []
+            found.append(f)
+    return found
+
+
 def compile_filters(
     filter: Optional[Filter],
-) -> Optional[List[
+) -> Tuple[Optional[List[str]], RawSearchFilter]:
     # TODO: Not every filter type is supported for every entity type.
     # If we can detect issues with the filters at compile time, we should
     # raise an error.

-
-
+    existing_soft_deleted_filter = _typed_dfs(filter, _StatusFilter)
+    if existing_soft_deleted_filter is None:
+        soft_deleted_filter = FilterDsl.soft_deleted(
+            RemovedStatusFilter.NOT_SOFT_DELETED
+        )
+        if filter is None:
+            filter = soft_deleted_filter
+        else:
+            filter = FilterDsl.and_(filter, soft_deleted_filter)
+
+    # This should be safe - if filter were None coming in, then we would replace it
+    # with the soft-deleted filter.
+    assert filter is not None

     initial_filters = filter.compile()
-
+
+    compiled_filters: RawSearchFilter = [
         {"and": [rule.to_raw() for rule in andClause["and"]]}
         for andClause in initial_filters
     ]

+    entity_types = compute_entity_types(initial_filters)
+
+    return entity_types, compiled_filters
+
+
+def compute_entity_types(
+    filters: _OrFilters,
+) -> Optional[List[str]]:
+    found_filters = False
+    found_positive_filters = False
+    entity_types: List[str] = []
+    for ands in filters:
+        for clause in ands["and"]:
+            if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:
+                found_filters = True
+                if not clause.negated:
+                    found_positive_filters = True
+
+                entity_types.extend(clause.values)
+
+    if not found_filters:
+        # If we didn't find any filters, use None so we use the default set.
+        return None
+
+    if not found_positive_filters:
+        # If we only found negated filters, then it's probably a query like
+        # "find me all entities except for dashboards". In that case, we
+        # still want to use the default set.
+        return None
+
+    return entity_types
+

 class SearchClient:
     def __init__(self, client: DataHubClient):
@@ -43,8 +113,11 @@ class SearchClient:
         filter: Optional[Filter] = None,
     ) -> Iterable[Urn]:
         # TODO: Add better limit / pagination support.
+        types, compiled_filters = compile_filters(filter)
         for urn in self._client._graph.get_urns_by_filter(
             query=query,
-
+            status=None,
+            extra_or_filters=compiled_filters,
+            entity_types=types,
         ):
             yield Urn.from_string(urn)
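To make the new search contract concrete, here is a brief illustrative sketch (not part of the wheel) that uses only symbols introduced in the diff above; it assumes the package is installed and importable.

    from datahub.sdk.search_client import compile_filters
    from datahub.sdk.search_filters import FilterDsl

    # compile_filters() now returns a pair: GraphQL entity type names (or None,
    # meaning "use the default search set") plus the raw OR-of-ANDs filter payload.
    # When the caller supplies no _StatusFilter, a NOT_SOFT_DELETED status filter
    # is appended automatically before compilation.
    entity_types, raw_filters = compile_filters(FilterDsl.entity_type("dataset"))

    # A positive _entityType clause was present, so a concrete list comes back.
    assert entity_types is not None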
datahub/sdk/search_filters.py
CHANGED
@@ -3,7 +3,10 @@ from __future__ import annotations
 import abc
 from typing import (
     Any,
+    ClassVar,
+    Iterator,
     List,
+    Optional,
     Sequence,
     TypedDict,
     Union,
@@ -13,8 +16,13 @@ import pydantic

 from datahub.configuration.common import ConfigModel
 from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
-from datahub.ingestion.graph.client import
-from datahub.ingestion.graph.filters import
+from datahub.ingestion.graph.client import flexible_entity_type_to_graphql
+from datahub.ingestion.graph.filters import (
+    FilterOperator,
+    RemovedStatusFilter,
+    SearchFilterRule,
+    _get_status_filter,
+)
 from datahub.metadata.schema_classes import EntityTypeName
 from datahub.metadata.urns import DataPlatformUrn, DomainUrn

@@ -37,25 +45,28 @@ class _BaseFilter(ConfigModel):
     def compile(self) -> _OrFilters:
         pass

-
-
-        if entity_type.upper() == entity_type:
-            # Assume that we were passed a graphql EntityType enum value,
-            # so no conversion is needed.
-            return entity_type
-        return entity_type_to_graphql(entity_type)
+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self


 class _EntityTypeFilter(_BaseFilter):
+    """Filter for specific entity types.
+
+    If no entity type filter is specified, we will search all entity types in the
+    default search set, mirroring the behavior of the DataHub UI.
+    """
+
+    ENTITY_TYPE_FIELD: ClassVar[str] = "_entityType"
+
     entity_type: List[str] = pydantic.Field(
         description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
     )

     def _build_rule(self) -> SearchFilterRule:
         return SearchFilterRule(
-            field=
+            field=self.ENTITY_TYPE_FIELD,
             condition="EQUAL",
-            values=[
+            values=[flexible_entity_type_to_graphql(t) for t in self.entity_type],
         )

     def compile(self) -> _OrFilters:
@@ -63,25 +74,39 @@ class _EntityTypeFilter(_BaseFilter):


 class _EntitySubtypeFilter(_BaseFilter):
-    entity_type: str
     entity_subtype: str = pydantic.Field(
         description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
     )

+    def _build_rule(self) -> SearchFilterRule:
+        return SearchFilterRule(
+            field="typeNames",
+            condition="EQUAL",
+            values=[self.entity_subtype],
+        )
+
     def compile(self) -> _OrFilters:
-
-
-
-
-
-
-
-
-
-
-
-
-        return
+        return [{"and": [self._build_rule()]}]
+
+
+class _StatusFilter(_BaseFilter):
+    """Filter for the status of entities during search.
+
+    If not explicitly specified, the NOT_SOFT_DELETED status filter will be applied.
+    """
+
+    status: RemovedStatusFilter
+
+    def _build_rule(self) -> Optional[SearchFilterRule]:
+        return _get_status_filter(self.status)
+
+    def compile(self) -> _OrFilters:
+        rule = self._build_rule()
+        if rule:
+            return [{"and": [rule]}]
+        else:
+            # Our boolean algebra logic requires something here - returning [] would cause errors.
+            return FilterDsl.true().compile()


 class _PlatformFilter(_BaseFilter):
@@ -157,10 +182,10 @@ class _EnvFilter(_BaseFilter):


 class _CustomCondition(_BaseFilter):
-    """Represents a single field condition"""
+    """Represents a single field condition."""

     field: str
-    condition:
+    condition: FilterOperator
     values: List[str]

     def compile(self) -> _OrFilters:
@@ -173,7 +198,7 @@ class _CustomCondition(_BaseFilter):


 class _And(_BaseFilter):
-    """Represents an AND conjunction of filters"""
+    """Represents an AND conjunction of filters."""

     and_: Sequence["Filter"] = pydantic.Field(alias="and")
     # TODO: Add validator to ensure that the "and" field is not empty
@@ -219,9 +244,14 @@ class _And(_BaseFilter):
             ]
         }

+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.and_:
+            yield from filter.dfs()
+

 class _Or(_BaseFilter):
-    """Represents an OR conjunction of filters"""
+    """Represents an OR conjunction of filters."""

     or_: Sequence["Filter"] = pydantic.Field(alias="or")
     # TODO: Add validator to ensure that the "or" field is not empty
@@ -232,9 +262,14 @@ class _Or(_BaseFilter):
             merged_filter.extend(filter.compile())
         return merged_filter

+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        for filter in self.or_:
+            yield from filter.dfs()
+

 class _Not(_BaseFilter):
-    """Represents a NOT filter"""
+    """Represents a NOT filter."""

     not_: "Filter" = pydantic.Field(alias="not")

@@ -262,6 +297,10 @@ class _Not(_BaseFilter):

         return final_filters

+    def dfs(self) -> Iterator[_BaseFilter]:
+        yield self
+        yield from self.not_.dfs()
+

 # TODO: With pydantic 2, we can use a RootModel with a
 # discriminated union to make the error messages more informative.
@@ -271,6 +310,7 @@ Filter = Union[
     _Not,
     _EntityTypeFilter,
     _EntitySubtypeFilter,
+    _StatusFilter,
     _PlatformFilter,
     _DomainFilter,
     _EnvFilter,
@@ -318,6 +358,18 @@ class FilterDsl:
     def not_(arg: "Filter") -> _Not:
         return _Not(not_=arg)

+    @staticmethod
+    def true() -> "Filter":
+        return _CustomCondition(
+            field="urn",
+            condition="EXISTS",
+            values=[],
+        )
+
+    @staticmethod
+    def false() -> "Filter":
+        return FilterDsl.not_(FilterDsl.true())
+
     @staticmethod
     def entity_type(
         entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
@@ -329,14 +381,15 @@ class FilterDsl:
         )

     @staticmethod
-    def entity_subtype(
+    def entity_subtype(
+        entity_subtype: Union[str, Sequence[str]],
+    ) -> _EntitySubtypeFilter:
         return _EntitySubtypeFilter(
-
-            entity_subtype=subtype,
+            entity_subtype=entity_subtype,
         )

     @staticmethod
-    def platform(platform: Union[str,
+    def platform(platform: Union[str, Sequence[str]], /) -> _PlatformFilter:
         return _PlatformFilter(
             platform=[platform] if isinstance(platform, str) else platform
         )
@@ -344,11 +397,11 @@ class FilterDsl:
     # TODO: Add a platform_instance filter

     @staticmethod
-    def domain(domain: Union[str,
+    def domain(domain: Union[str, Sequence[str]], /) -> _DomainFilter:
         return _DomainFilter(domain=[domain] if isinstance(domain, str) else domain)

     @staticmethod
-    def env(env: Union[str,
+    def env(env: Union[str, Sequence[str]], /) -> _EnvFilter:
         return _EnvFilter(env=[env] if isinstance(env, str) else env)

     @staticmethod
@@ -359,13 +412,17 @@ class FilterDsl:
             values=[f"{key}={value}"],
         )

+    @staticmethod
+    def soft_deleted(status: RemovedStatusFilter) -> _StatusFilter:
+        return _StatusFilter(status=status)
+
     # TODO: Add a soft-deletion status filter
     # TODO: add a container / browse path filter
     # TODO add shortcut for custom filters

     @staticmethod
     def custom_filter(
-        field: str, condition:
+        field: str, condition: FilterOperator, values: Sequence[str]
     ) -> _CustomCondition:
         return _CustomCondition(
             field=field,
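For orientation, a hedged sketch (illustrative, not shipped in the package) of how the DSL additions compose, using only methods visible in the diff above.

    from datahub.ingestion.graph.filters import RemovedStatusFilter
    from datahub.sdk.search_filters import FilterDsl as F

    # Combine an entity-type filter with an explicit soft-deletion status filter.
    flt = F.and_(
        F.entity_type("dataset"),
        F.soft_deleted(RemovedStatusFilter.NOT_SOFT_DELETED),
    )

    # The new dfs() method walks the filter tree; compile_filters() relies on it
    # to detect whether a _StatusFilter is already present before injecting one.
    print([type(f).__name__ for f in flt.dfs()])

    # compile() still yields the OR-of-ANDs structure consumed by the search API.
    print(flt.compile())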
datahub/sql_parsing/split_statements.py
CHANGED
@@ -1,7 +1,9 @@
+import logging
 import re
 from enum import Enum
 from typing import Iterator, List, Tuple

+logger = logging.getLogger(__name__)
 SELECT_KEYWORD = "SELECT"
 CASE_KEYWORD = "CASE"
 END_KEYWORD = "END"
@@ -120,7 +122,9 @@ class _StatementSplitter:
         # Reset current_statement-specific state.
         self.does_select_mean_new_statement = False
         if self.current_case_statements != 0:
-
+            logger.warning(
+                f"Unexpected END keyword. Current case statements: {self.current_case_statements}"
+            )
         self.current_case_statements = 0

     def process(self) -> Iterator[str]:
@@ -233,8 +237,10 @@ class _StatementSplitter:
             ),
         )
         if (
-            is_force_new_statement_keyword
-
+            is_force_new_statement_keyword
+            and not self._has_preceding_cte(most_recent_real_char)
+            and not self._is_part_of_merge_query()
+        ):
             # Force termination of current statement
             yield from self._yield_if_complete()

@@ -247,6 +253,14 @@ class _StatementSplitter:
         else:
             self.current_statement.append(c)

+    def _has_preceding_cte(self, most_recent_real_char: str) -> bool:
+        # usually we'd have a close paren that closes a CTE
+        return most_recent_real_char == ")"
+
+    def _is_part_of_merge_query(self) -> bool:
+        # In merge statement we'd have `when matched then` or `when not matched then"
+        return "".join(self.current_statement).strip().lower().endswith("then")
+

 def split_statements(sql: str) -> Iterator[str]:
     """
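A short illustrative check of what the new CTE/MERGE guards change; the SQL below is invented for the example.

    from datahub.sql_parsing.split_statements import split_statements

    sql = """
    CREATE TABLE tgt (id INT);
    MERGE INTO tgt USING src ON tgt.id = src.id
    WHEN MATCHED THEN UPDATE SET id = src.id
    WHEN NOT MATCHED THEN INSERT (id) VALUES (src.id);
    """

    # With _is_part_of_merge_query(), the UPDATE/INSERT keywords that follow
    # "WHEN ... THEN" should no longer force a split, so the MERGE is expected
    # to come back as a single statement alongside the CREATE TABLE.
    for statement in split_statements(sql):
        print("---")
        print(statement)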
datahub/sql_parsing/sql_parsing_aggregator.py
CHANGED
@@ -30,6 +30,7 @@ from datahub.metadata.urns import (
     DatasetUrn,
     QueryUrn,
     SchemaFieldUrn,
+    Urn,
 )
 from datahub.sql_parsing.schema_resolver import (
     SchemaResolver,
@@ -139,6 +140,8 @@ class QueryMetadata:

     used_temp_tables: bool = True

+    origin: Optional[Urn] = None
+
     def make_created_audit_stamp(self) -> models.AuditStampClass:
         return models.AuditStampClass(
             time=make_ts_millis(self.latest_timestamp) or 0,
@@ -221,6 +224,7 @@ class PreparsedQuery:
     )
     # Use this to store addtitional key-value information about query for debugging
     extra_info: Optional[dict] = None
+    origin: Optional[Urn] = None


 @dataclasses.dataclass
@@ -903,6 +907,7 @@ class SqlParsingAggregator(Closeable):
                 column_usage=parsed.column_usage or {},
                 confidence_score=parsed.confidence_score,
                 used_temp_tables=session_has_temp_tables,
+                origin=parsed.origin,
             )
         )

@@ -1464,6 +1469,7 @@ class SqlParsingAggregator(Closeable):
                 source=models.QuerySourceClass.SYSTEM,
                 created=query.make_created_audit_stamp(),
                 lastModified=query.make_last_modified_audit_stamp(),
+                origin=query.origin.urn() if query.origin else None,
             ),
             models.QuerySubjectsClass(
                 subjects=[
datahub/sql_parsing/tool_meta_extractor.py
CHANGED
@@ -13,7 +13,7 @@ from datahub.api.entities.platformresource.platform_resource import (
 )
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn
+from datahub.metadata.urns import CorpGroupUrn, CorpUserUrn, DataPlatformUrn, Urn
 from datahub.utilities.search_utils import LogicalOperator
 from datahub.utilities.stats_collections import int_top_k_dict

@@ -21,6 +21,10 @@ UrnStr = str

 logger = logging.getLogger(__name__)

+MODE_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:mode")
+LOOKER_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:looker")
+HEX_PLATFORM_URN = DataPlatformUrn.from_string("urn:li:dataPlatform:hex")
+

 class QueryLog(Protocol):
     """Represents Query Log Entry
@@ -30,6 +34,7 @@ class QueryLog(Protocol):
     query_text: str
     user: Optional[Union[CorpUserUrn, CorpGroupUrn]]
     extra_info: Optional[dict]
+    origin: Optional[Urn]


 def _get_last_line(query: str) -> str:
@@ -67,6 +72,10 @@ class ToolMetaExtractor:
                 "looker",
                 self._extract_looker_query,
             ),
+            (
+                "hex",
+                self._extract_hex_query,
+            ),
         ]
         # maps user id (as string) to email address
         self.looker_user_mapping = looker_user_mapping
@@ -153,7 +162,7 @@ class ToolMetaExtractor:
         entry.extra_info = entry.extra_info or {}
         entry.extra_info["user_via"] = original_user

-
+        entry.origin = MODE_PLATFORM_URN

         return True

@@ -190,6 +199,22 @@ class ToolMetaExtractor:
         entry.extra_info = entry.extra_info or {}
         entry.extra_info["user_via"] = original_user

+        entry.origin = LOOKER_PLATFORM_URN
+
+        return True
+
+    def _extract_hex_query(self, entry: QueryLog) -> bool:
+        """
+        Returns:
+            bool: whether QueryLog entry is that of hex.
+        """
+        last_line = _get_last_line(entry.query_text)
+
+        if not last_line.startswith("-- Hex query metadata:"):
+            return False
+
+        entry.origin = HEX_PLATFORM_URN
+
         return True

     def extract_bi_metadata(self, entry: QueryLog) -> bool:
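Illustratively (the metadata payload format is not specified here), the new hex handler keys on a trailing comment in the query text; a self-contained check of that convention:

    # A query whose last line carries the Hex marker. For such an entry the new
    # _extract_hex_query() hook sets entry.origin to HEX_PLATFORM_URN
    # (urn:li:dataPlatform:hex) and returns True.
    query_text = (
        "SELECT customer_id, SUM(amount) FROM orders GROUP BY customer_id\n"
        "-- Hex query metadata: {...}"
    )

    last_line = query_text.splitlines()[-1].strip()
    print(last_line.startswith("-- Hex query metadata:"))  # True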
datahub/testing/mcp_diff.py
CHANGED
@@ -8,7 +8,6 @@ import deepdiff.serialization
 import yaml
 from deepdiff import DeepDiff
 from deepdiff.model import DiffLevel
-from deepdiff.operator import BaseOperator
 from typing_extensions import Literal

 ReportType = Literal[
@@ -59,27 +58,12 @@ class AspectForDiff:

 @dataclasses.dataclass
 class DeltaInfo:
-    """Information about an MCP used to construct a diff delta.
-
-    In a separate class so it can be ignored by DeepDiff via MCPDeltaInfoOperator.
-    """
+    """Information about an MCP used to construct a diff delta."""

     idx: int  # Location in list of MCEs in golden file
     original: Dict[str, Any]  # Original json-serialized MCP


-class DeltaInfoOperator(BaseOperator):
-    """Warning: Doesn't seem to be working right now.
-    Ignored via an ignore path as an extra layer of defense.
-    """
-
-    def __init__(self):
-        super().__init__(types=[DeltaInfo])
-
-    def give_up_diffing(self, *args: Any, **kwargs: Any) -> bool:
-        return True
-
-
 AspectsByUrn = Dict[str, Dict[str, List[AspectForDiff]]]


@@ -176,7 +160,6 @@ class MCPDiff:
                 t2=t2,
                 exclude_regex_paths=ignore_paths,
                 ignore_order=True,
-                custom_operators=[DeltaInfoOperator()],
             )
             if diff:
                 aspect_changes[urn][aspect_name] = MCPAspectDiff.create(diff)
datahub/utilities/threaded_iterator_executor.py
CHANGED
@@ -1,7 +1,15 @@
 import concurrent.futures
 import contextlib
 import queue
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Optional,
+    Tuple,
+    TypeVar,
+)

 T = TypeVar("T")

@@ -18,8 +26,13 @@ class ThreadedIteratorExecutor:
         worker_func: Callable[..., Iterable[T]],
         args_list: Iterable[Tuple[Any, ...]],
         max_workers: int,
-
-
+        max_backpressure: Optional[int] = None,
+    ) -> Iterator[T]:
+        if max_backpressure is None:
+            max_backpressure = 10 * max_workers
+        assert max_backpressure >= max_workers
+
+        out_q: queue.Queue[T] = queue.Queue(maxsize=max_backpressure)

         def _worker_wrapper(
             worker_func: Callable[..., Iterable[T]], *args: Any