acryl-datahub 1.0.0rc8__py3-none-any.whl → 1.0.0rc10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/METADATA +2623 -2624
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/RECORD +53 -49
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +731 -42
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/specific/dataset_cli.py +128 -14
- datahub/emitter/mce_builder.py +28 -13
- datahub/ingestion/graph/client.py +15 -11
- datahub/ingestion/graph/filters.py +64 -37
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/common/subtypes.py +7 -0
- datahub/ingestion/source/identity/okta.py +22 -0
- datahub/ingestion/source/metabase.py +3 -3
- datahub/ingestion/source/mode.py +1 -1
- datahub/ingestion/source/preset.py +7 -4
- datahub/ingestion/source/sql/mssql/job_models.py +29 -0
- datahub/ingestion/source/sql/mssql/source.py +10 -4
- datahub/ingestion/source/superset.py +158 -24
- datahub/metadata/_schema_classes.py +157 -14
- datahub/metadata/_urns/urn_defs.py +82 -58
- datahub/metadata/schema.avsc +23 -10
- datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessKey.avsc +2 -1
- datahub/metadata/schemas/DataProductKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryNodeKey.avsc +2 -1
- datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
- datahub/metadata/schemas/MLModelGroupKey.avsc +2 -1
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
- datahub/metadata/schemas/PostKey.avsc +2 -1
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/VersionProperties.avsc +18 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
- datahub/pydantic/__init__.py +0 -0
- datahub/pydantic/compat.py +58 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/_all_entities.py +1 -1
- datahub/sdk/_shared.py +88 -3
- datahub/sdk/container.py +7 -1
- datahub/sdk/dataset.py +7 -1
- datahub/sdk/{_entity.py → entity.py} +4 -0
- datahub/sdk/entity_client.py +1 -1
- datahub/sdk/main_client.py +7 -1
- datahub/sdk/resolver_client.py +17 -29
- datahub/sdk/search_client.py +50 -0
- datahub/sdk/search_filters.py +374 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc8.dist-info → acryl_datahub-1.0.0rc10.dist-info}/top_level.txt +0 -0
datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED

@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Iterable, List, Optional, Union
 
 import yaml
 from pydantic import validator
@@ -38,7 +38,7 @@ class AllowedTypes(Enum):
 
 
 class AllowedValue(ConfigModel):
-    value: str
+    value: Union[int, float, str]
     description: Optional[str] = None
 
 
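The widened `value` type means allowed values for structured properties can be declared as numbers as well as strings. A minimal sketch of the effect, using a plain pydantic model as a stand-in for DataHub's ConfigModel:

# Minimal sketch; AllowedValue here is a stand-in built on pydantic.BaseModel,
# whereas the real class extends DataHub's ConfigModel.
from typing import Optional, Union

from pydantic import BaseModel


class AllowedValue(BaseModel):
    value: Union[int, float, str]
    description: Optional[str] = None


retention_days = AllowedValue(value=30, description="Retention in days")
tier = AllowedValue(value="gold", description="Highest quality tier")
print(retention_days.value, tier.value)  # 30 gold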
datahub/cli/specific/dataset_cli.py
CHANGED

@@ -1,12 +1,15 @@
+import filecmp
 import json
 import logging
+import os
+import shutil
 from pathlib import Path
-from typing import Set, Tuple
+from typing import List, Set, Tuple
 
 import click
 from click_default_group import DefaultGroup
 
-from datahub.api.entities.dataset.dataset import Dataset
+from datahub.api.entities.dataset.dataset import Dataset, DatasetRetrievalConfig
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
 from datahub.metadata.com.linkedin.pegasus2avro.common import Siblings
@@ -30,18 +33,9 @@ def dataset() -> None:
 @telemetry.with_telemetry()
 def upsert(file: Path) -> None:
     """Upsert attributes to a Dataset in DataHub."""
-
-
-
-    try:
-        for mcp in dataset.generate_mcp():
-            graph.emit(mcp)
-        click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
-    except Exception as e:
-        click.secho(
-            f"Update failed for id {id}. due to {e}",
-            fg="red",
-        )
+    # Call the sync command with to_datahub=True to perform the upsert operation
+    ctx = click.get_current_context()
+    ctx.invoke(sync, file=str(file), to_datahub=True)
 
 
 @dataset.command(
@@ -111,3 +105,123 @@ def _get_existing_siblings(graph: DataHubGraph, urn: str) -> Set[str]:
         return set(existing.siblings)
     else:
         return set()
+
+
+@dataset.command(
+    name="file",
+)
+@click.option("--lintCheck", required=False, is_flag=True)
+@click.option("--lintFix", required=False, is_flag=True)
+@click.argument("file", type=click.Path(exists=True))
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def file(lintcheck: bool, lintfix: bool, file: str) -> None:
+    """Operate on a Dataset file"""
+
+    if lintcheck or lintfix:
+        import tempfile
+        from pathlib import Path
+
+        # Create a temporary file in a secure way
+        # The file will be automatically deleted when the context manager exits
+        with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as temp:
+            temp_path = Path(temp.name)
+            try:
+                # Copy content to the temporary file
+                shutil.copyfile(file, temp_path)
+
+                # Run the linting
+                datasets = Dataset.from_yaml(temp.name)
+                for dataset in datasets:
+                    dataset.to_yaml(temp_path)
+
+                # Compare the files
+                files_match = filecmp.cmp(file, temp_path)
+
+                if files_match:
+                    click.secho("No differences found", fg="green")
+                else:
+                    # Show diff for visibility
+                    os.system(f"diff {file} {temp_path}")
+
+                    if lintfix:
+                        shutil.copyfile(temp_path, file)
+                        click.secho(f"Fixed linting issues in {file}", fg="green")
+                    else:
+                        click.secho(
+                            f"To fix these differences, run 'datahub dataset file --lintFix {file}'",
+                            fg="yellow",
+                        )
+            finally:
+                # Ensure the temporary file is removed
+                if temp_path.exists():
+                    temp_path.unlink()
+    else:
+        click.secho(
+            "No operation specified. Choose from --lintCheck or --lintFix", fg="yellow"
+        )
+
+
+@dataset.command(
+    name="sync",
+)
+@click.option("-f", "--file", required=True, type=click.Path(exists=True))
+@click.option("--to-datahub/--from-datahub", required=True, is_flag=True)
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def sync(file: str, to_datahub: bool) -> None:
+    """Sync a Dataset file to/from DataHub"""
+
+    failures: List[str] = []
+    with get_default_graph() as graph:
+        datasets = Dataset.from_yaml(file)
+        for dataset in datasets:
+            assert (
+                dataset.urn is not None
+            )  # Validator should have ensured this is filled. Tell mypy it's not None
+            if to_datahub:
+                missing_entity_references = [
+                    entity_reference
+                    for entity_reference in dataset.entity_references()
+                    if not graph.exists(entity_reference)
+                ]
+                if missing_entity_references:
+                    click.secho(
+                        "\n\t- ".join(
+                            [
+                                f"Skipping Dataset {dataset.urn} due to missing entity references: "
+                            ]
+                            + missing_entity_references
+                        ),
+                        fg="red",
+                    )
+                    failures.append(dataset.urn)
+                    continue
+                try:
+                    for mcp in dataset.generate_mcp():
+                        graph.emit(mcp)
+                    click.secho(f"Update succeeded for urn {dataset.urn}.", fg="green")
+                except Exception as e:
+                    click.secho(
+                        f"Update failed for id {id}. due to {e}",
+                        fg="red",
+                    )
+            else:
+                # Sync from DataHub
+                if graph.exists(dataset.urn):
+                    dataset_get_config = DatasetRetrievalConfig()
+                    if dataset.downstreams:
+                        dataset_get_config.include_downstreams = True
+                    existing_dataset: Dataset = Dataset.from_datahub(
+                        graph=graph, urn=dataset.urn, config=dataset_get_config
+                    )
+                    existing_dataset.to_yaml(Path(file))
+                else:
+                    click.secho(f"Dataset {dataset.urn} does not exist")
+                    failures.append(dataset.urn)
+    if failures:
+        click.secho(
+            f"\nFailed to sync the following Datasets: {', '.join(failures)}",
+            fg="red",
+        )
+        raise click.Abort()
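The new `file` and `sync` commands can be exercised through click's test runner; a minimal sketch, assuming `dataset.yaml` is a valid Dataset definition and a DataHub instance is reachable for the sync step:

# Minimal sketch; the YAML path is a placeholder, and `dataset` is the click
# group defined in datahub/cli/specific/dataset_cli.py above.
from click.testing import CliRunner

from datahub.cli.specific.dataset_cli import dataset

runner = CliRunner()

# Lint-check: round-trips the YAML through Dataset.to_yaml and diffs the result.
result = runner.invoke(dataset, ["file", "--lintCheck", "dataset.yaml"])
print(result.output)

# Push the file's contents to DataHub; --from-datahub would pull instead.
result = runner.invoke(dataset, ["sync", "-f", "dataset.yaml", "--to-datahub"])
print(result.exit_code)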
datahub/emitter/mce_builder.py
CHANGED
@@ -52,7 +52,15 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     _Aspect as AspectAbstract,
 )
-from datahub.metadata.urns import
+from datahub.metadata.urns import (
+    ChartUrn,
+    DashboardUrn,
+    DataFlowUrn,
+    DataJobUrn,
+    DataPlatformUrn,
+    DatasetUrn,
+    TagUrn,
+)
 from datahub.utilities.urn_encoder import UrnEncoder
 
 logger = logging.getLogger(__name__)
@@ -119,7 +127,7 @@ def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
 def make_data_platform_urn(platform: str) -> str:
     if platform.startswith("urn:li:dataPlatform:"):
         return platform
-    return
+    return DataPlatformUrn.create_from_id(platform).urn()
 
 
 def make_dataset_urn(platform: str, name: str, env: str = DEFAULT_ENV) -> str:
@@ -236,7 +244,7 @@ def make_user_urn(username: str) -> str:
     Makes a user urn if the input is not a user or group urn already
     """
     return (
-        f"urn:li:corpuser:{username}"
+        f"urn:li:corpuser:{UrnEncoder.encode_string(username)}"
        if not username.startswith(("urn:li:corpuser:", "urn:li:corpGroup:"))
         else username
     )
@@ -249,7 +257,7 @@ def make_group_urn(groupname: str) -> str:
     if groupname and groupname.startswith(("urn:li:corpGroup:", "urn:li:corpuser:")):
         return groupname
     else:
-        return f"urn:li:corpGroup:{groupname}"
+        return f"urn:li:corpGroup:{UrnEncoder.encode_string(groupname)}"
 
 
 def make_tag_urn(tag: str) -> str:
@@ -301,7 +309,12 @@ def make_data_flow_urn(
 
 
 def make_data_job_urn_with_flow(flow_urn: str, job_id: str) -> str:
-
+    data_flow_urn = DataFlowUrn.from_string(flow_urn)
+    data_job_urn = DataJobUrn.create_from_ids(
+        data_flow_urn=data_flow_urn.urn(),
+        job_id=job_id,
+    )
+    return data_job_urn.urn()
 
 
 def make_data_process_instance_urn(dataProcessInstanceId: str) -> str:
@@ -324,10 +337,11 @@ def make_dashboard_urn(
     platform: str, name: str, platform_instance: Optional[str] = None
 ) -> str:
     # FIXME: dashboards don't currently include data platform urn prefixes.
-
-
-
-
+    return DashboardUrn.create_from_ids(
+        platform=platform,
+        name=name,
+        platform_instance=platform_instance,
+    ).urn()
 
 
 def dashboard_urn_to_key(dashboard_urn: str) -> Optional[DashboardKeyClass]:
@@ -342,10 +356,11 @@ def make_chart_urn(
     platform: str, name: str, platform_instance: Optional[str] = None
 ) -> str:
     # FIXME: charts don't currently include data platform urn prefixes.
-
-
-
-
+    return ChartUrn.create_from_ids(
+        platform=platform,
+        name=name,
+        platform_instance=platform_instance,
+    ).urn()
 
 
 def chart_urn_to_key(chart_urn: str) -> Optional[ChartKeyClass]:
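These helpers now delegate to the typed urn classes from `datahub.metadata.urns`. An illustrative sketch of the expected outputs (the platform and asset names are made up; special characters would additionally be encoded by UrnEncoder, so outputs are indicative rather than exact):

# Illustrative sketch of the urn builders touched in this diff.
from datahub.emitter import mce_builder as builder

print(builder.make_data_platform_urn("snowflake"))
# urn:li:dataPlatform:snowflake

print(builder.make_data_platform_urn("urn:li:dataPlatform:snowflake"))
# already a platform urn, returned unchanged

print(builder.make_dashboard_urn("looker", "dashboards.123"))
# urn:li:dashboard:(looker,dashboards.123)

print(builder.make_user_urn("jdoe"))
# urn:li:corpuser:jdoe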
datahub/ingestion/graph/client.py
CHANGED

@@ -16,6 +16,7 @@ from typing import (
     List,
     Literal,
     Optional,
+    Sequence,
     Tuple,
     Type,
     Union,
@@ -42,8 +43,8 @@ from datahub.ingestion.graph.connections import (
 )
 from datahub.ingestion.graph.entity_versioning import EntityVersioningAPI
 from datahub.ingestion.graph.filters import (
+    RawSearchFilterRule,
     RemovedStatusFilter,
-    SearchFilterRule,
     generate_filter,
 )
 from datahub.ingestion.source.state.checkpoint import Checkpoint
@@ -105,7 +106,7 @@ class RelatedEntity:
     via: Optional[str] = None
 
 
-def
+def entity_type_to_graphql(entity_type: str) -> str:
     """Convert the entity types into GraphQL "EntityType" enum values."""
 
     # Hard-coded special cases.
@@ -797,13 +798,13 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 100,
-        extraFilters: Optional[List[
+        extraFilters: Optional[List[RawSearchFilterRule]] = None,
     ) -> Iterable[Tuple[str, "GraphQLSchemaMetadata"]]:
         """Fetch schema info for datasets that match all of the given filters.
 
         :return: An iterable of (urn, schema info) tuple that match the filters.
         """
-        types = [
+        types = [entity_type_to_graphql("dataset")]
 
         # Add the query default of * if no query is specified.
         query = query or "*"
@@ -865,7 +866,7 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
     def get_urns_by_filter(
         self,
         *,
-        entity_types: Optional[
+        entity_types: Optional[Sequence[str]] = None,
         platform: Optional[str] = None,
         platform_instance: Optional[str] = None,
         env: Optional[str] = None,
@@ -873,8 +874,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
-        extraFilters: Optional[List[
-        extra_or_filters: Optional[List[Dict[str, List[
+        extraFilters: Optional[List[RawSearchFilterRule]] = None,
+        extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
     ) -> Iterable[str]:
         """Fetch all urns that match all of the given filters.
 
@@ -965,8 +966,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
         container: Optional[str] = None,
         status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED,
         batch_size: int = 10000,
-        extra_and_filters: Optional[List[
-        extra_or_filters: Optional[List[Dict[str, List[
+        extra_and_filters: Optional[List[RawSearchFilterRule]] = None,
+        extra_or_filters: Optional[List[Dict[str, List[RawSearchFilterRule]]]] = None,
         extra_source_fields: Optional[List[str]] = None,
         skip_cache: bool = False,
     ) -> Iterable[dict]:
@@ -1109,7 +1110,8 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                     f"Scrolling to next scrollAcrossEntities page: {scroll_id}"
                 )
 
-
+    @classmethod
+    def _get_types(cls, entity_types: Optional[Sequence[str]]) -> Optional[List[str]]:
         types: Optional[List[str]] = None
         if entity_types is not None:
             if not entity_types:
@@ -1117,7 +1119,9 @@ class DataHubGraph(DatahubRestEmitter, EntityVersioningAPI):
                     "entity_types cannot be an empty list; use None for all entities"
                 )
 
-            types = [
+            types = [
+                entity_type_to_graphql(entity_type) for entity_type in entity_types
+            ]
         return types
 
     def get_latest_pipeline_checkpoint(
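A short sketch of how the raw filter rules flow into `get_urns_by_filter`; the server URL, platform, and tag values are placeholders:

# Minimal sketch, assuming a reachable DataHub instance. RawSearchFilterRule
# is just the Dict[str, Any] alias introduced in this release, so extraFilters
# takes plain dicts in the shape produced by SearchFilterRule.to_raw().
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

urns = graph.get_urns_by_filter(
    entity_types=["dataset"],
    platform="snowflake",
    extraFilters=[
        {"field": "tags", "condition": "EQUAL", "values": ["urn:li:tag:PII"]}
    ],
)
for urn in urns:
    print(urn)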
datahub/ingestion/graph/filters.py
CHANGED

@@ -1,3 +1,4 @@
+import dataclasses
 import enum
 from typing import Any, Dict, List, Optional
 
@@ -7,7 +8,31 @@ from datahub.emitter.mce_builder import (
 )
 from datahub.utilities.urns.urn import guess_entity_type
 
-
+RawSearchFilterRule = Dict[str, Any]
+
+
+@dataclasses.dataclass
+class SearchFilterRule:
+    field: str
+    condition: str  # TODO: convert to an enum
+    values: List[str]
+    negated: bool = False
+
+    def to_raw(self) -> RawSearchFilterRule:
+        return {
+            "field": self.field,
+            "condition": self.condition,
+            "values": self.values,
+            "negated": self.negated,
+        }
+
+    def negate(self) -> "SearchFilterRule":
+        return SearchFilterRule(
+            field=self.field,
+            condition=self.condition,
+            values=self.values,
+            negated=not self.negated,
+        )
 
 
 class RemovedStatusFilter(enum.Enum):
@@ -29,9 +54,9 @@ def generate_filter(
     env: Optional[str],
     container: Optional[str],
     status: RemovedStatusFilter,
-    extra_filters: Optional[List[
-    extra_or_filters: Optional[List[
-) -> List[Dict[str, List[
+    extra_filters: Optional[List[RawSearchFilterRule]],
+    extra_or_filters: Optional[List[RawSearchFilterRule]] = None,
+) -> List[Dict[str, List[RawSearchFilterRule]]]:
     """
     Generate a search filter based on the provided parameters.
     :param platform: The platform to filter by.
@@ -43,30 +68,32 @@ def generate_filter(
     :param extra_or_filters: Extra OR filters to apply. These are combined with
     the AND filters using an OR at the top level.
     """
-    and_filters: List[
+    and_filters: List[RawSearchFilterRule] = []
 
     # Platform filter.
     if platform:
-        and_filters.append(_get_platform_filter(platform))
+        and_filters.append(_get_platform_filter(platform).to_raw())
 
     # Platform instance filter.
     if platform_instance:
-        and_filters.append(
+        and_filters.append(
+            _get_platform_instance_filter(platform, platform_instance).to_raw()
+        )
 
     # Browse path v2 filter.
     if container:
-        and_filters.append(_get_container_filter(container))
+        and_filters.append(_get_container_filter(container).to_raw())
 
     # Status filter.
     status_filter = _get_status_filter(status)
     if status_filter:
-        and_filters.append(status_filter)
+        and_filters.append(status_filter.to_raw())
 
     # Extra filters.
     if extra_filters:
         and_filters += extra_filters
 
-    or_filters: List[Dict[str, List[
+    or_filters: List[Dict[str, List[RawSearchFilterRule]]] = [{"and": and_filters}]
 
     # Env filter
     if env:
@@ -89,7 +116,7 @@ def generate_filter(
     return or_filters
 
 
-def _get_env_filters(env: str) -> List[
+def _get_env_filters(env: str) -> List[RawSearchFilterRule]:
     # The env filter is a bit more tricky since it's not always stored
     # in the same place in ElasticSearch.
     return [
@@ -125,19 +152,19 @@ def _get_status_filter(status: RemovedStatusFilter) -> Optional[SearchFilterRule
         # removed field is simply not present in the ElasticSearch document. Ideally this
         # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to
         # use a negated filter.
-        return
-
-
-
-
-
+        return SearchFilterRule(
+            field="removed",
+            values=["true"],
+            condition="EQUAL",
+            negated=True,
+        )
 
     elif status == RemovedStatusFilter.ONLY_SOFT_DELETED:
-        return
-
-
-
-
+        return SearchFilterRule(
+            field="removed",
+            values=["true"],
+            condition="EQUAL",
+        )
 
     elif status == RemovedStatusFilter.ALL:
         # We don't need to add a filter for this case.
@@ -152,11 +179,11 @@ def _get_container_filter(container: str) -> SearchFilterRule:
     if guess_entity_type(container) != "container":
         raise ValueError(f"Invalid container urn: {container}")
 
-    return
-
-
-
-
+    return SearchFilterRule(
+        field="browsePathV2",
+        values=[container],
+        condition="CONTAIN",
+    )
 
 
 def _get_platform_instance_filter(
@@ -171,16 +198,16 @@ def _get_platform_instance_filter(
     if guess_entity_type(platform_instance) != "dataPlatformInstance":
         raise ValueError(f"Invalid data platform instance urn: {platform_instance}")
 
-    return
-
-        "
-
-
+    return SearchFilterRule(
+        field="platformInstance",
+        condition="EQUAL",
+        values=[platform_instance],
+    )
 
 
 def _get_platform_filter(platform: str) -> SearchFilterRule:
-    return
-
-        "
-
-
+    return SearchFilterRule(
+        field="platform.keyword",
+        condition="EQUAL",
+        values=[make_data_platform_urn(platform)],
+    )
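The new dataclass keeps individual rules composable until they are serialized into the raw payload; a short sketch using only what this file defines:

# Short sketch of the SearchFilterRule helpers added in this release.
from datahub.ingestion.graph.filters import SearchFilterRule

soft_deleted = SearchFilterRule(field="removed", condition="EQUAL", values=["true"])
not_soft_deleted = soft_deleted.negate()

print(not_soft_deleted.to_raw())
# {'field': 'removed', 'condition': 'EQUAL', 'values': ['true'], 'negated': True}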
@@ -59,9 +59,9 @@ from datahub.metadata.schema_classes import (
     UpstreamLineageClass,
     ViewPropertiesClass,
 )
-from datahub.sdk._entity import Entity
 from datahub.sdk.container import Container
 from datahub.sdk.dataset import Dataset
+from datahub.sdk.entity import Entity
 
 logger = logging.getLogger(__name__)
 
datahub/ingestion/source/common/subtypes.py
CHANGED

@@ -60,8 +60,15 @@ class BIContainerSubTypes(StrEnum):
     MODE_COLLECTION = "Collection"
 
 
+class FlowContainerSubTypes(StrEnum):
+    MSSQL_JOB = "Job"
+    MSSQL_PROCEDURE_CONTAINER = "Procedures Container"
+
+
 class JobContainerSubTypes(StrEnum):
     NIFI_PROCESS_GROUP = "Process Group"
+    MSSQL_JOBSTEP = "Job Step"
+    MSSQL_STORED_PROCEDURE = "Stored Procedure"
 
 
 class BIAssetSubTypes(StrEnum):
datahub/ingestion/source/identity/okta.py
CHANGED

@@ -666,6 +666,27 @@ class OktaSource(StatefulIngestionSourceBase):
             self.config.okta_profile_to_username_regex,
         )
 
+    def _map_okta_user_profile_custom_properties(
+        self, profile: UserProfile
+    ) -> Dict[str, str]:
+        # filter out the common fields that are already mapped to the CorpUserInfo aspect and the private ones
+        return {
+            k: str(v)
+            for k, v in profile.__dict__.items()
+            if v
+            and k
+            not in [
+                "displayName",
+                "firstName",
+                "lastName",
+                "email",
+                "title",
+                "countryCode",
+                "department",
+            ]
+            and not k.startswith("_")
+        }
+
     # Converts Okta User Profile into a CorpUserInfo.
     def _map_okta_user_profile(self, profile: UserProfile) -> CorpUserInfoClass:
         # TODO: Extract user's manager if provided.
@@ -683,6 +704,7 @@ class OktaSource(StatefulIngestionSourceBase):
             title=profile.title,
             countryCode=profile.countryCode,
             departmentName=profile.department,
+            customProperties=self._map_okta_user_profile_custom_properties(profile),
         )
 
     def _make_corp_group_urn(self, name: str) -> str:
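To illustrate the filtering above, here is a hypothetical stand-in for `profile.__dict__`: keys that are falsy, already mapped to CorpUserInfo, or private are dropped, and the remaining values are stringified.

# Hypothetical profile data, used only to show what the comprehension keeps.
profile_dict = {
    "displayName": "Jane Doe",    # already mapped -> dropped
    "email": "jane@example.com",  # already mapped -> dropped
    "division": "EMEA",
    "employeeNumber": 1234,
    "managerId": None,            # falsy -> dropped
    "_links": {"self": "..."},    # private -> dropped
}

already_mapped = [
    "displayName", "firstName", "lastName", "email", "title", "countryCode", "department",
]
custom_properties = {
    k: str(v)
    for k, v in profile_dict.items()
    if v and k not in already_mapped and not k.startswith("_")
}
print(custom_properties)  # {'division': 'EMEA', 'employeeNumber': '1234'}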
datahub/ingestion/source/metabase.py
CHANGED

@@ -313,7 +313,7 @@ class MetabaseSource(StatefulIngestionSourceBase):
             return None
 
         dashboard_urn = builder.make_dashboard_urn(
-            self.platform, dashboard_details.get("id", "")
+            self.platform, str(dashboard_details.get("id", ""))
         )
         dashboard_snapshot = DashboardSnapshot(
             urn=dashboard_urn,
@@ -337,7 +337,7 @@ class MetabaseSource(StatefulIngestionSourceBase):
             card_id = card_info.get("card").get("id", "")
             if not card_id:
                 continue  # most likely a virtual card without an id (text or heading), not relevant.
-            chart_urn = builder.make_chart_urn(self.platform, card_id)
+            chart_urn = builder.make_chart_urn(self.platform, str(card_id))
             chart_urns.append(chart_urn)
 
         dashboard_info_class = DashboardInfoClass(
@@ -459,7 +459,7 @@ class MetabaseSource(StatefulIngestionSourceBase):
             )
             return None
 
-        chart_urn = builder.make_chart_urn(self.platform, card_id)
+        chart_urn = builder.make_chart_urn(self.platform, str(card_id))
         chart_snapshot = ChartSnapshot(
             urn=chart_urn,
             aspects=[],
datahub/ingestion/source/mode.py
CHANGED
@@ -377,7 +377,7 @@ class ModeSource(StatefulIngestionSourceBase):
     ]
 
     def _dashboard_urn(self, report_info: dict) -> str:
-        return builder.make_dashboard_urn(self.platform, report_info.get("id", ""))
+        return builder.make_dashboard_urn(self.platform, str(report_info.get("id", "")))
 
     def _parse_last_run_at(self, report_info: dict) -> Optional[int]:
         # Mode queries are refreshed, and that timestamp is reflected correctly here.
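The Metabase and Mode fixes share the same motivation: those REST APIs return numeric ids, while the urn builders expect string names, so the ids are now coerced with str() before building the urn. A tiny illustrative sketch (the id is made up):

# Illustrative: a numeric report id is stringified before building the urn.
from datahub.emitter import mce_builder as builder

report_id = 4242
print(builder.make_dashboard_urn("mode", str(report_id)))
# urn:li:dashboard:(mode,4242)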
datahub/ingestion/source/preset.py
CHANGED

@@ -16,10 +16,13 @@ from datahub.ingestion.api.decorators import (
     support_status,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
-    StaleEntityRemovalSourceReport,
     StatefulStaleMetadataRemovalConfig,
 )
-from datahub.ingestion.source.superset import
+from datahub.ingestion.source.superset import (
+    SupersetConfig,
+    SupersetSource,
+    SupersetSourceReport,
+)
 from datahub.utilities import config_clean
 
 logger = logging.getLogger(__name__)
@@ -76,7 +79,7 @@ class PresetSource(SupersetSource):
     """
 
     config: PresetConfig
-    report:
+    report: SupersetSourceReport
     platform = "preset"
 
     def __init__(self, ctx: PipelineContext, config: PresetConfig):
@@ -84,7 +87,7 @@ class PresetSource(SupersetSource):
 
         super().__init__(ctx, config)
         self.config = config
-        self.report =
+        self.report = SupersetSourceReport()
         self.platform = "preset"
 
     def login(self):