acryl-datahub 1.0.0.1rc3__py3-none-any.whl → 1.0.0.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/METADATA +2329 -2329
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/RECORD +17 -17
- datahub/_version.py +1 -1
- datahub/ingestion/graph/filters.py +22 -2
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/redshift/lineage_v2.py +7 -0
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/superset.py +153 -13
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/main_client.py +2 -1
- datahub/sdk/search_filters.py +18 -23
- datahub/sql_parsing/split_statements.py +12 -2
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/top_level.txt +0 -0
|
{acryl_datahub-1.0.0.1rc3.dist-info → acryl_datahub-1.0.0.1rc4.dist-info}/RECORD
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
acryl_datahub-1.0.0.
|
|
1
|
+
acryl_datahub-1.0.0.1rc4.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
|
|
2
2
|
datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
|
|
3
3
|
datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
|
|
4
|
-
datahub/_version.py,sha256=
|
|
4
|
+
datahub/_version.py,sha256=C5PxZfTY1_MHATsJ5uiJ0n1KBC0rumbfeq67GwRBzYQ,323
|
|
5
5
|
datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
|
|
6
6
|
datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
|
|
7
7
|
datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -175,7 +175,7 @@ datahub/ingestion/graph/client.py,sha256=rdX2DXqTXyLyS1_qiUzc3zzIE8CFheP2pYi1I68
|
|
|
175
175
|
datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
|
|
176
176
|
datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
|
|
177
177
|
datahub/ingestion/graph/entity_versioning.py,sha256=nrcNz0Qm6kpE6oTu_mrYUQDx14KPspBTc6R9SyFUY6c,6901
|
|
178
|
-
datahub/ingestion/graph/filters.py,sha256=
|
|
178
|
+
datahub/ingestion/graph/filters.py,sha256=VFZKmef7ay1sQ5zRDDC1M_i6T96VzIgs-FzMs5eibiQ,7347
|
|
179
179
|
datahub/ingestion/reporting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
180
180
|
datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py,sha256=iEulcZMLBQuUfe9MAYyobMekvMcNm4dqVcS_C_2KfrI,9736
|
|
181
181
|
datahub/ingestion/reporting/file_reporter.py,sha256=tiWukmMxHrTQI3rOAumsq6lRlw8T6spqpS6XBDYnrZU,1640
|
|
@@ -200,7 +200,7 @@ datahub/ingestion/source/demo_data.py,sha256=PbtCHlZx3wrKlOPPgkWhDQuPm7ZfIx2neXJ
|
|
|
200
200
|
datahub/ingestion/source/elastic_search.py,sha256=2dwIcSbYMaq_RoSnxLGz4Q_20oJ8AGgMKunVIBIgYM8,23406
|
|
201
201
|
datahub/ingestion/source/feast.py,sha256=lsk0jc_0gKiiBNgmrmT8o8bvBOSLFPQMNrvWEcOye2w,18802
|
|
202
202
|
datahub/ingestion/source/file.py,sha256=h6CRH7hrKcFxu1SmZDjqJcJUSrc031u5oJUl2clnPO4,15976
|
|
203
|
-
datahub/ingestion/source/ge_data_profiler.py,sha256=
|
|
203
|
+
datahub/ingestion/source/ge_data_profiler.py,sha256=gQDHVA6-yYtg1dwH6D7RJGcCPDbE61BAkJ5i4-KHD1g,65092
|
|
204
204
|
datahub/ingestion/source/ge_profiling_config.py,sha256=FlWfXoVoayabVXNMB9qETEU0GX0az6HYqNUZRnIu_fQ,10866
|
|
205
205
|
datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
|
|
206
206
|
datahub/ingestion/source/ldap.py,sha256=CNr3foofIpoCXu_GGqfcajlQE2qkHr5isYwVcDutdkk,18695
|
|
@@ -217,7 +217,7 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
|
|
|
217
217
|
datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
|
|
218
218
|
datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
|
|
219
219
|
datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
|
|
220
|
-
datahub/ingestion/source/superset.py,sha256=
|
|
220
|
+
datahub/ingestion/source/superset.py,sha256=FRZ7cCURW6NHUOKaFicdAZq2caXektvO9rJE4tO9scU,40336
|
|
221
221
|
datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
222
222
|
datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
|
|
223
223
|
datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
|
|
@@ -318,7 +318,7 @@ datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
|
|
|
318
318
|
datahub/ingestion/source/gc/datahub_gc.py,sha256=EXO-Stj6gGMLTSTbSBC-C3_zpjpQtFN9pAMWR95ma0I,12830
|
|
319
319
|
datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=mUWcMt-_FL1SYGIgI4lGZDZGXspUUTv__5GN1W2oJ3s,17118
|
|
320
320
|
datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=y-9ZIs_DZPUzYH1CI6HmaAZg3olNNA7MjT8HrCqAI0k,11159
|
|
321
|
-
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=
|
|
321
|
+
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=4-qQR_2HGIYU8kC2hRIsJyKKMb9lKq4B6paJm_abUk4,12628
|
|
322
322
|
datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
323
323
|
datahub/ingestion/source/gcs/gcs_source.py,sha256=5EZkrDqjRNQz_aUL1MLp0PTFm0Ztubmk0NYJGZTRLjU,6276
|
|
324
324
|
datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
|
|
@@ -410,9 +410,9 @@ datahub/ingestion/source/redshift/config.py,sha256=l_hlgsCjvlcgcFQpd5WMKlW8nqQUh
|
|
|
410
410
|
datahub/ingestion/source/redshift/datashares.py,sha256=kH3YkoenOa59XZU12XeUf283lOOAITYD9jOXpy8R06E,9227
|
|
411
411
|
datahub/ingestion/source/redshift/exception.py,sha256=dxzYUIv5B_FAWhOuzG2u5We7FX-ar4jhOXPXAlEIvgM,2055
|
|
412
412
|
datahub/ingestion/source/redshift/lineage.py,sha256=IPF8vHy2MFyhK-hu2-lxV2-kcnNAEzltPLnnIvwIBMY,44100
|
|
413
|
-
datahub/ingestion/source/redshift/lineage_v2.py,sha256=
|
|
413
|
+
datahub/ingestion/source/redshift/lineage_v2.py,sha256=vQ2LBa04hqYqIRK0CP3VDYRlvMLAqodzdieDl6LipiQ,17909
|
|
414
414
|
datahub/ingestion/source/redshift/profile.py,sha256=dq7m9YG3TvEMbplwVIutUpzbXLPH8KIj9SuWNo7PWWE,4323
|
|
415
|
-
datahub/ingestion/source/redshift/query.py,sha256=
|
|
415
|
+
datahub/ingestion/source/redshift/query.py,sha256=vVIuNUaU4a7AfMFJZlgLuqi0cGVl0gVz8xZUSnPhWvs,47845
|
|
416
416
|
datahub/ingestion/source/redshift/redshift.py,sha256=whMujnJxwNT2ZXnOVRrZQiy317hlsvbARzabKmI3oN8,43536
|
|
417
417
|
datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
|
|
418
418
|
datahub/ingestion/source/redshift/redshift_schema.py,sha256=7F-l_omOuKMuGE_rBWXVPG_GWXFKnCMzC4frNxZB9cs,24800
|
|
@@ -895,7 +895,7 @@ datahub/metadata/schemas/ViewProperties.avsc,sha256=3HhcbH5493dJUnEUtFMYMVfbYQ52
|
|
|
895
895
|
datahub/metadata/schemas/__init__.py,sha256=uvLNC3VyCkWA_v8e9FdA1leFf46NFKDD0AajCfihepI,581
|
|
896
896
|
datahub/pydantic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
897
897
|
datahub/pydantic/compat.py,sha256=TUEo4kSEeOWVAhV6LQtst1phrpVgGtK4uif4OI5vQ2M,1937
|
|
898
|
-
datahub/sdk/__init__.py,sha256=
|
|
898
|
+
datahub/sdk/__init__.py,sha256=QeutS6Th8K4E4ZxXuoGrmvahN6zA9Oh9asKk5mw9AIk,1670
|
|
899
899
|
datahub/sdk/_all_entities.py,sha256=lCa2oeudmrKN7suOBIT5tidGJB8pgL3xD80DQlsYUKI,399
|
|
900
900
|
datahub/sdk/_attribution.py,sha256=0Trh8steVd27GOr9MKCZeawbuDD2_q3GIsZlCtHqEUg,1321
|
|
901
901
|
datahub/sdk/_shared.py,sha256=pHVKEJ50BoLw0fLLAm9zYsynNDN_bPI26qlj8nk2iyY,19582
|
|
@@ -904,10 +904,10 @@ datahub/sdk/container.py,sha256=yw_vw9Jl1wOYNwMHxQHLz5ZvVQVDWWHi9CWBR3hOCd8,7547
|
|
|
904
904
|
datahub/sdk/dataset.py,sha256=5LG4c_8bHeSPYrW88KNXRgiPD8frBjR0OBVrrwdquU4,29152
|
|
905
905
|
datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
|
|
906
906
|
datahub/sdk/entity_client.py,sha256=Sxe6H6Vr_tqLJu5KW7MJfLWJ6mgh4mbsx7u7MOBpM64,5052
|
|
907
|
-
datahub/sdk/main_client.py,sha256=
|
|
907
|
+
datahub/sdk/main_client.py,sha256=h2MKRhR-BO0zGCMhF7z2bTncX4hagKrAYwR3wTNTtzA,3666
|
|
908
908
|
datahub/sdk/resolver_client.py,sha256=nKMAZJt2tRSGfKSzoREIh43PXqjM3umLiYkYHJjo1io,3243
|
|
909
909
|
datahub/sdk/search_client.py,sha256=h9O_rsphkTdpd5hMPay3xSXfJM761cf4PjNCBwCnFzU,1309
|
|
910
|
-
datahub/sdk/search_filters.py,sha256=
|
|
910
|
+
datahub/sdk/search_filters.py,sha256=WaJKFUKT9P70NBkh36f44rZrJ7zRJpRwu8mN-rEx5y0,11364
|
|
911
911
|
datahub/secret/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
912
912
|
datahub/secret/datahub_secret_store.py,sha256=9u9S87-15jwhj4h0EsAVIMdQLgvstKc8voQux2slxgU,2477
|
|
913
913
|
datahub/secret/datahub_secrets_client.py,sha256=nDmhziKdvseJHlaDVUcAwK8Fv8maeAaG-ktZtWG2b70,1316
|
|
@@ -933,7 +933,7 @@ datahub/sql_parsing/_sqlglot_patch.py,sha256=dpTiGj4Jxzvbi6HkYiOTbZ81CTFwEEo6JgY
|
|
|
933
933
|
datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
|
|
934
934
|
datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
|
|
935
935
|
datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
|
|
936
|
-
datahub/sql_parsing/split_statements.py,sha256=
|
|
936
|
+
datahub/sql_parsing/split_statements.py,sha256=OIQXA9e4k3G9Z1y7rbgdtZhMWt4FPnq41cE8Jkm9cBY,9542
|
|
937
937
|
datahub/sql_parsing/sql_parsing_aggregator.py,sha256=A3_0wSxBJSRowEaslptDpBoKO42XXx5UyTEK9PkekIs,70317
|
|
938
938
|
datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
|
|
939
939
|
datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
|
|
@@ -1043,8 +1043,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
|
|
|
1043
1043
|
datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
|
|
1044
1044
|
datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
|
|
1045
1045
|
datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
|
|
1046
|
-
acryl_datahub-1.0.0.
|
|
1047
|
-
acryl_datahub-1.0.0.
|
|
1048
|
-
acryl_datahub-1.0.0.
|
|
1049
|
-
acryl_datahub-1.0.0.
|
|
1050
|
-
acryl_datahub-1.0.0.
|
|
1046
|
+
acryl_datahub-1.0.0.1rc4.dist-info/METADATA,sha256=0QZSNfWv2u7u7GcupcTXvYmmBOqeB7vfGNKHUyQEoNs,176849
|
|
1047
|
+
acryl_datahub-1.0.0.1rc4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
1048
|
+
acryl_datahub-1.0.0.1rc4.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
|
|
1049
|
+
acryl_datahub-1.0.0.1rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
|
|
1050
|
+
acryl_datahub-1.0.0.1rc4.dist-info/RECORD,,
|
datahub/_version.py
CHANGED
|
datahub/ingestion/graph/filters.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Literal, Optional
|
|
4
|
+
|
|
5
|
+
from typing_extensions import TypeAlias
|
|
4
6
|
|
|
5
7
|
from datahub.emitter.mce_builder import (
|
|
6
8
|
make_data_platform_urn,
|
|
@@ -10,11 +12,29 @@ from datahub.utilities.urns.urn import guess_entity_type
|
|
|
10
12
|
|
|
11
13
|
RawSearchFilterRule = Dict[str, Any]
|
|
12
14
|
|
|
15
|
+
# Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
|
|
16
|
+
FilterOperator: TypeAlias = Literal[
|
|
17
|
+
"CONTAIN",
|
|
18
|
+
"EQUAL",
|
|
19
|
+
"IEQUAL",
|
|
20
|
+
"IN",
|
|
21
|
+
"EXISTS",
|
|
22
|
+
"GREATER_THAN",
|
|
23
|
+
"GREATER_THAN_OR_EQUAL_TO",
|
|
24
|
+
"LESS_THAN",
|
|
25
|
+
"LESS_THAN_OR_EQUAL_TO",
|
|
26
|
+
"START_WITH",
|
|
27
|
+
"END_WITH",
|
|
28
|
+
"DESCENDANTS_INCL",
|
|
29
|
+
"ANCESTORS_INCL",
|
|
30
|
+
"RELATED_INCL",
|
|
31
|
+
]
|
|
32
|
+
|
|
13
33
|
|
|
14
34
|
@dataclasses.dataclass
|
|
15
35
|
class SearchFilterRule:
|
|
16
36
|
field: str
|
|
17
|
-
condition:
|
|
37
|
+
condition: FilterOperator
|
|
18
38
|
values: List[str]
|
|
19
39
|
negated: bool = False
|
|
20
40
|
|
|
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
CHANGED
|
@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
|
|
|
12
12
|
from datahub.ingestion.api.common import PipelineContext
|
|
13
13
|
from datahub.ingestion.api.source import SourceReport
|
|
14
14
|
from datahub.ingestion.graph.client import DataHubGraph
|
|
15
|
-
from datahub.ingestion.graph.filters import RemovedStatusFilter
|
|
15
|
+
from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
|
|
16
16
|
from datahub.utilities.lossy_collections import LossyList
|
|
17
17
|
from datahub.utilities.stats_collections import TopKDict
|
|
18
18
|
from datahub.utilities.urns._urn_base import Urn
|
|
19
|
+
from datahub.utilities.urns.error import InvalidUrnError
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
21
22
|
|
|
22
|
-
QUERY_ENTITIES = """
|
|
23
|
-
query listEntities($input: ScrollAcrossEntitiesInput!) {
|
|
24
|
-
scrollAcrossEntities(input: $input) {
|
|
25
|
-
nextScrollId
|
|
26
|
-
count
|
|
27
|
-
searchResults {
|
|
28
|
-
entity {
|
|
29
|
-
... on QueryEntity {
|
|
30
|
-
urn
|
|
31
|
-
}
|
|
32
|
-
... on DataProcessInstance {
|
|
33
|
-
urn
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
}
|
|
39
|
-
"""
|
|
40
|
-
|
|
41
23
|
|
|
42
24
|
class SoftDeletedEntitiesCleanupConfig(ConfigModel):
|
|
43
25
|
enabled: bool = Field(
|
|
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
|
|
|
64
46
|
)
|
|
65
47
|
|
|
66
48
|
entity_types: Optional[List[str]] = Field(
|
|
67
|
-
default
|
|
49
|
+
# A default value is required; otherwise QUERY and DATAPROCESS_INSTANCE won't be included
|
|
50
|
+
default=[
|
|
51
|
+
"dataset",
|
|
52
|
+
"dashboard",
|
|
53
|
+
"chart",
|
|
54
|
+
"mlmodel",
|
|
55
|
+
"mlmodelGroup",
|
|
56
|
+
"mlfeatureTable",
|
|
57
|
+
"mlfeature",
|
|
58
|
+
"mlprimaryKey",
|
|
59
|
+
"dataFlow",
|
|
60
|
+
"dataJob",
|
|
61
|
+
"glossaryTerm",
|
|
62
|
+
"glossaryNode",
|
|
63
|
+
"tag",
|
|
64
|
+
"role",
|
|
65
|
+
"corpuser",
|
|
66
|
+
"corpGroup",
|
|
67
|
+
"container",
|
|
68
|
+
"domain",
|
|
69
|
+
"dataProduct",
|
|
70
|
+
"notebook",
|
|
71
|
+
"businessAttribute",
|
|
72
|
+
"schemaField",
|
|
73
|
+
"query",
|
|
74
|
+
"dataProcessInstance",
|
|
75
|
+
],
|
|
68
76
|
description="List of entity types to cleanup",
|
|
69
77
|
)
|
|
70
78
|
|
|
@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
|
|
|
103
111
|
num_entities_found: Dict[str, int] = field(default_factory=dict)
|
|
104
112
|
num_soft_deleted_entity_processed: int = 0
|
|
105
113
|
num_soft_deleted_retained_due_to_age: int = 0
|
|
114
|
+
num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
|
|
115
|
+
default_factory=TopKDict
|
|
116
|
+
)
|
|
106
117
|
num_soft_deleted_entity_removal_started: int = 0
|
|
107
118
|
num_hard_deleted: int = 0
|
|
108
119
|
num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
|
|
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
|
|
|
111
122
|
)
|
|
112
123
|
runtime_limit_reached: bool = False
|
|
113
124
|
deletion_limit_reached: bool = False
|
|
125
|
+
num_soft_deleted_entity_found: int = 0
|
|
126
|
+
num_soft_deleted_entity_invalid_urn: int = 0
|
|
114
127
|
|
|
115
128
|
|
|
116
129
|
class SoftDeletedEntitiesCleanup:
|
|
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
|
|
|
133
146
|
self.config = config
|
|
134
147
|
self.report = report
|
|
135
148
|
self.dry_run = dry_run
|
|
136
|
-
self.start_time =
|
|
149
|
+
self.start_time = time.time()
|
|
137
150
|
self._report_lock: Lock = Lock()
|
|
138
151
|
self.last_print_time = 0.0
|
|
139
152
|
|
|
@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
|
|
|
142
155
|
with self._report_lock:
|
|
143
156
|
self.report.num_soft_deleted_retained_due_to_age += 1
|
|
144
157
|
|
|
158
|
+
def _increment_retained_by_type(self, type: str) -> None:
|
|
159
|
+
"""Thread-safe method to update report fields"""
|
|
160
|
+
with self._report_lock:
|
|
161
|
+
self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
|
|
162
|
+
self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
|
|
163
|
+
+ 1
|
|
164
|
+
)
|
|
165
|
+
|
|
145
166
|
def _increment_removal_started_count(self) -> None:
|
|
146
167
|
"""Thread-safe method to update report fields"""
|
|
147
168
|
with self._report_lock:
|
|
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
|
|
|
160
181
|
)
|
|
161
182
|
self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
|
|
162
183
|
|
|
163
|
-
def delete_entity(self, urn:
|
|
184
|
+
def delete_entity(self, urn: Urn) -> None:
|
|
164
185
|
assert self.ctx.graph
|
|
165
186
|
|
|
166
|
-
entity_urn = Urn.from_string(urn)
|
|
167
187
|
if self.dry_run:
|
|
168
188
|
logger.info(
|
|
169
189
|
f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
|
|
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
|
|
|
172
192
|
if self._deletion_limit_reached() or self._times_up():
|
|
173
193
|
return
|
|
174
194
|
self._increment_removal_started_count()
|
|
175
|
-
self.ctx.graph.delete_entity(urn=urn, hard=True)
|
|
195
|
+
self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
|
|
176
196
|
self.ctx.graph.delete_references_to_urn(
|
|
177
|
-
urn=urn,
|
|
197
|
+
urn=urn.urn(),
|
|
178
198
|
dry_run=False,
|
|
179
199
|
)
|
|
180
|
-
self._update_report(urn,
|
|
200
|
+
self._update_report(urn.urn(), urn.entity_type)
|
|
181
201
|
|
|
182
|
-
def delete_soft_deleted_entity(self, urn:
|
|
202
|
+
def delete_soft_deleted_entity(self, urn: Urn) -> None:
|
|
183
203
|
assert self.ctx.graph
|
|
184
204
|
|
|
185
205
|
retention_time = (
|
|
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
|
|
|
187
207
|
- self.config.retention_days * 24 * 60 * 60
|
|
188
208
|
)
|
|
189
209
|
|
|
190
|
-
aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
|
|
210
|
+
aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
|
|
191
211
|
if "status" in aspect["aspects"]:
|
|
192
212
|
if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
|
|
193
213
|
"status"
|
|
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
|
|
|
196
216
|
self.delete_entity(urn)
|
|
197
217
|
else:
|
|
198
218
|
self._increment_retained_count()
|
|
219
|
+
self._increment_retained_by_type(urn.entity_type)
|
|
199
220
|
|
|
200
221
|
def _print_report(self) -> None:
|
|
201
222
|
time_taken = round(time.time() - self.last_print_time, 1)
|
|
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
|
|
|
204
225
|
self.last_print_time = time.time()
|
|
205
226
|
logger.info(f"\n{self.report.as_string()}")
|
|
206
227
|
|
|
207
|
-
def _process_futures(self, futures: Dict[Future,
|
|
228
|
+
def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
|
|
208
229
|
done, not_done = wait(futures, return_when=FIRST_COMPLETED)
|
|
209
230
|
futures = {future: urn for future, urn in futures.items() if future in not_done}
|
|
210
231
|
|
|
@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
|
|
|
214
235
|
self.report.failure(
|
|
215
236
|
title="Failed to delete entity",
|
|
216
237
|
message="Failed to delete entity",
|
|
217
|
-
context=futures[future],
|
|
238
|
+
context=futures[future].urn(),
|
|
218
239
|
exc=future.exception(),
|
|
219
240
|
)
|
|
220
241
|
self.report.num_soft_deleted_entity_processed += 1
|
|
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
|
|
|
229
250
|
time.sleep(self.config.delay)
|
|
230
251
|
return futures
|
|
231
252
|
|
|
232
|
-
def
|
|
253
|
+
def _get_urns(self) -> Iterable[str]:
|
|
233
254
|
assert self.ctx.graph
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
# to avoid a giant stacktrace by having a smaller batch size in first call
|
|
240
|
-
# This will be remove in future version after server with fix has been
|
|
241
|
-
# around for a while
|
|
242
|
-
batch_size = 10
|
|
243
|
-
|
|
244
|
-
while True:
|
|
245
|
-
try:
|
|
246
|
-
if entity_type not in self.report.num_calls_made:
|
|
247
|
-
self.report.num_calls_made[entity_type] = 1
|
|
248
|
-
else:
|
|
249
|
-
self.report.num_calls_made[entity_type] += 1
|
|
250
|
-
self._print_report()
|
|
251
|
-
result = self.ctx.graph.execute_graphql(
|
|
252
|
-
graphql_query,
|
|
253
|
-
{
|
|
254
|
-
"input": {
|
|
255
|
-
"types": [entity_type],
|
|
256
|
-
"query": "*",
|
|
257
|
-
"scrollId": scroll_id if scroll_id else None,
|
|
258
|
-
"count": batch_size,
|
|
259
|
-
"orFilters": [
|
|
260
|
-
{
|
|
261
|
-
"and": [
|
|
262
|
-
{
|
|
263
|
-
"field": "removed",
|
|
264
|
-
"values": ["true"],
|
|
265
|
-
"condition": "EQUAL",
|
|
266
|
-
}
|
|
267
|
-
]
|
|
268
|
-
}
|
|
269
|
-
],
|
|
270
|
-
}
|
|
271
|
-
},
|
|
272
|
-
)
|
|
273
|
-
except Exception as e:
|
|
274
|
-
self.report.failure(
|
|
275
|
-
f"While trying to get {entity_type} with {scroll_id}", exc=e
|
|
276
|
-
)
|
|
277
|
-
break
|
|
278
|
-
scroll_across_entities = result.get("scrollAcrossEntities")
|
|
279
|
-
if not scroll_across_entities:
|
|
280
|
-
break
|
|
281
|
-
search_results = scroll_across_entities.get("searchResults")
|
|
282
|
-
count = scroll_across_entities.get("count")
|
|
283
|
-
if not count or not search_results:
|
|
284
|
-
# Due to a server bug we cannot rely on just count as it was returning response like this
|
|
285
|
-
# {'count': 1, 'nextScrollId': None, 'searchResults': []}
|
|
286
|
-
break
|
|
287
|
-
if entity_type == "DATA_PROCESS_INSTANCE":
|
|
288
|
-
# Temp workaround. See note in beginning of the function
|
|
289
|
-
# We make the batch size = config after call has succeeded once
|
|
290
|
-
batch_size = self.config.batch_size
|
|
291
|
-
scroll_id = scroll_across_entities.get("nextScrollId")
|
|
292
|
-
if entity_type not in self.report.num_entities_found:
|
|
293
|
-
self.report.num_entities_found[entity_type] = 0
|
|
294
|
-
self.report.num_entities_found[entity_type] += scroll_across_entities.get(
|
|
295
|
-
"count"
|
|
255
|
+
# Entities created in the retention period are not considered for deletion
|
|
256
|
+
created_from = int(
|
|
257
|
+
(
|
|
258
|
+
datetime.now(timezone.utc).timestamp()
|
|
259
|
+
- self.config.retention_days * 24 * 60 * 60
|
|
296
260
|
)
|
|
297
|
-
|
|
298
|
-
|
|
261
|
+
* 1000
|
|
262
|
+
)
|
|
263
|
+
|
|
264
|
+
entity_types = self.config.entity_types
|
|
265
|
+
# dataProcessInstance is a special case where we need to get the entities separately
|
|
266
|
+
# because we need to filter on created time so that we don't stream too many dataProcessInstance entities at once
|
|
267
|
+
# The GC source soft-deletes dataProcessInstance entities, which results in a large number of soft-deleted entities
|
|
268
|
+
if (
|
|
269
|
+
self.config.entity_types
|
|
270
|
+
and "dataProcessInstance" in self.config.entity_types
|
|
271
|
+
):
|
|
272
|
+
entity_types = self.config.entity_types.copy()
|
|
273
|
+
yield from self.ctx.graph.get_urns_by_filter(
|
|
274
|
+
entity_types=["dataProcessInstance"],
|
|
275
|
+
platform=self.config.platform,
|
|
276
|
+
env=self.config.env,
|
|
277
|
+
query=self.config.query,
|
|
278
|
+
status=RemovedStatusFilter.ONLY_SOFT_DELETED,
|
|
279
|
+
batch_size=self.config.batch_size,
|
|
280
|
+
extraFilters=[
|
|
281
|
+
SearchFilterRule(
|
|
282
|
+
field="created",
|
|
283
|
+
condition="LESS_THAN",
|
|
284
|
+
values=[f"{created_from}"],
|
|
285
|
+
).to_raw()
|
|
286
|
+
],
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
entity_types.remove("dataProcessInstance")
|
|
299
290
|
|
|
300
|
-
def _get_urns(self) -> Iterable[str]:
|
|
301
|
-
assert self.ctx.graph
|
|
302
291
|
yield from self.ctx.graph.get_urns_by_filter(
|
|
303
|
-
entity_types=
|
|
292
|
+
entity_types=entity_types,
|
|
304
293
|
platform=self.config.platform,
|
|
305
294
|
env=self.config.env,
|
|
306
295
|
query=self.config.query,
|
|
307
296
|
status=RemovedStatusFilter.ONLY_SOFT_DELETED,
|
|
308
297
|
batch_size=self.config.batch_size,
|
|
309
298
|
)
|
|
310
|
-
yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
|
|
311
|
-
yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
|
|
312
299
|
|
|
313
300
|
def _times_up(self) -> bool:
|
|
314
301
|
if (
|
|
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
|
|
|
335
322
|
return
|
|
336
323
|
self.start_time = time.time()
|
|
337
324
|
|
|
338
|
-
futures: Dict[Future,
|
|
325
|
+
futures: Dict[Future, Urn] = dict()
|
|
339
326
|
with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
|
|
340
327
|
for urn in self._get_urns():
|
|
328
|
+
try:
|
|
329
|
+
self.report.num_soft_deleted_entity_found += 1
|
|
330
|
+
soft_deleted_urn = Urn.from_string(urn)
|
|
331
|
+
except InvalidUrnError as e:
|
|
332
|
+
logger.error(f"Failed to parse urn {urn} with error {e}")
|
|
333
|
+
self.report.num_soft_deleted_entity_invalid_urn += 1
|
|
334
|
+
continue
|
|
335
|
+
|
|
341
336
|
self._print_report()
|
|
342
337
|
while len(futures) >= self.config.futures_max_at_time:
|
|
343
338
|
futures = self._process_futures(futures)
|
|
344
339
|
if self._deletion_limit_reached() or self._times_up():
|
|
345
340
|
break
|
|
346
|
-
future = executor.submit(
|
|
347
|
-
|
|
341
|
+
future = executor.submit(
|
|
342
|
+
self.delete_soft_deleted_entity, soft_deleted_urn
|
|
343
|
+
)
|
|
344
|
+
futures[future] = soft_deleted_urn
|
|
348
345
|
|
|
349
346
|
logger.info(f"Waiting for {len(futures)} futures to complete")
|
|
350
347
|
while len(futures) > 0:
|
|
datahub/ingestion/source/ge_data_profiler.py
CHANGED
|
@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
|
|
|
602
602
|
if not self.config.include_field_median_value:
|
|
603
603
|
return
|
|
604
604
|
try:
|
|
605
|
-
if self.dataset.engine.dialect.name.lower()
|
|
605
|
+
if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
|
|
606
606
|
column_profile.median = str(
|
|
607
607
|
self.dataset.engine.execute(
|
|
608
608
|
sa.select([sa.func.median(sa.column(column))]).select_from(
|
|
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
|
|
|
610
610
|
)
|
|
611
611
|
).scalar()
|
|
612
612
|
)
|
|
613
|
+
elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
|
|
614
|
+
column_profile.median = str(
|
|
615
|
+
self.dataset.engine.execute(
|
|
616
|
+
sa.select(
|
|
617
|
+
sa.text(
|
|
618
|
+
f"approx_percentile(`{column}`, 0.5) as approx_median"
|
|
619
|
+
)
|
|
620
|
+
).select_from(self.dataset._table)
|
|
621
|
+
).scalar()
|
|
622
|
+
)
|
|
613
623
|
elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
|
|
614
624
|
column_profile.median = str(
|
|
615
625
|
self.dataset.engine.execute(
|
|
datahub/ingestion/source/redshift/lineage_v2.py
CHANGED
|
@@ -400,6 +400,10 @@ class RedshiftSqlLineageV2(Closeable):
|
|
|
400
400
|
db_schemas: Dict[str, Dict[str, RedshiftSchema]],
|
|
401
401
|
) -> None:
|
|
402
402
|
for schema_name, tables in all_tables[self.database].items():
|
|
403
|
+
logger.info(f"External table lineage: checking schema {schema_name}")
|
|
404
|
+
if not db_schemas[self.database].get(schema_name):
|
|
405
|
+
logger.warning(f"Schema {schema_name} not found")
|
|
406
|
+
continue
|
|
403
407
|
for table in tables:
|
|
404
408
|
schema = db_schemas[self.database][schema_name]
|
|
405
409
|
if (
|
|
@@ -407,6 +411,9 @@ class RedshiftSqlLineageV2(Closeable):
|
|
|
407
411
|
and schema.is_external_schema()
|
|
408
412
|
and schema.external_platform
|
|
409
413
|
):
|
|
414
|
+
logger.info(
|
|
415
|
+
f"External table lineage: processing table {schema_name}.{table.name}"
|
|
416
|
+
)
|
|
410
417
|
# external_db_params = schema.option
|
|
411
418
|
upstream_platform = schema.external_platform.lower()
|
|
412
419
|
|
|
datahub/ingestion/source/redshift/query.py
CHANGED
|
@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
|
|
|
44
44
|
SELECT
|
|
45
45
|
schema_name,
|
|
46
46
|
schema_type,
|
|
47
|
-
schema_option,
|
|
47
|
+
cast(null as varchar(1024)) as schema_option,
|
|
48
48
|
cast(null as varchar(256)) as external_platform,
|
|
49
49
|
cast(null as varchar(256)) as external_database
|
|
50
50
|
FROM svv_redshift_schemas
|