acryl-datahub 1.0.0.1rc3__py3-none-any.whl → 1.0.0.1rc4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,7 +1,7 @@
- acryl_datahub-1.0.0.1rc3.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
+ acryl_datahub-1.0.0.1rc4.dist-info/licenses/LICENSE,sha256=9xNHpsD0uYF5ONzXsKDCuHHB-xbiCrSbueWXqrTNsxk,11365
  datahub/__init__.py,sha256=aq_i5lVREmoLfYIqcx_pEQicO855YlhD19tWc1eZZNI,59
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
- datahub/_version.py,sha256=ZzXmZcaAOZUvHmN9D5crLfATvvjGO_NN7ItKlRqtLD0,323
+ datahub/_version.py,sha256=C5PxZfTY1_MHATsJ5uiJ0n1KBC0rumbfeq67GwRBzYQ,323
  datahub/entrypoints.py,sha256=2TYgHhs3sCxJlojIHjqfxzt3_ImPwPzq4vBtsUuMqu4,8885
  datahub/errors.py,sha256=w6h8b27j9XlmPbTwqpu7-wgiTrXlHzcnUOnJ_iOrwzo,520
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -175,7 +175,7 @@ datahub/ingestion/graph/client.py,sha256=rdX2DXqTXyLyS1_qiUzc3zzIE8CFheP2pYi1I68
  datahub/ingestion/graph/config.py,sha256=_oha8Je7P80ZmrkZUAaRHyYbdMmTkMI5JkYjEP2Ri1Q,751
  datahub/ingestion/graph/connections.py,sha256=9462L0ZWGKURyypAln25eMPhK3pcufBar9tNDoqspXs,741
  datahub/ingestion/graph/entity_versioning.py,sha256=nrcNz0Qm6kpE6oTu_mrYUQDx14KPspBTc6R9SyFUY6c,6901
- datahub/ingestion/graph/filters.py,sha256=TL9JDVhpzKLfKf0m9vvzp3XCg3hecElaYRh0rajYfM8,6922
+ datahub/ingestion/graph/filters.py,sha256=VFZKmef7ay1sQ5zRDDC1M_i6T96VzIgs-FzMs5eibiQ,7347
  datahub/ingestion/reporting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py,sha256=iEulcZMLBQuUfe9MAYyobMekvMcNm4dqVcS_C_2KfrI,9736
  datahub/ingestion/reporting/file_reporter.py,sha256=tiWukmMxHrTQI3rOAumsq6lRlw8T6spqpS6XBDYnrZU,1640
@@ -200,7 +200,7 @@ datahub/ingestion/source/demo_data.py,sha256=PbtCHlZx3wrKlOPPgkWhDQuPm7ZfIx2neXJ
  datahub/ingestion/source/elastic_search.py,sha256=2dwIcSbYMaq_RoSnxLGz4Q_20oJ8AGgMKunVIBIgYM8,23406
  datahub/ingestion/source/feast.py,sha256=lsk0jc_0gKiiBNgmrmT8o8bvBOSLFPQMNrvWEcOye2w,18802
  datahub/ingestion/source/file.py,sha256=h6CRH7hrKcFxu1SmZDjqJcJUSrc031u5oJUl2clnPO4,15976
- datahub/ingestion/source/ge_data_profiler.py,sha256=uPHHgOr-fHXsdGonVa8Lc8ZE1yo0TyVWKhktwAuA3fI,64642
+ datahub/ingestion/source/ge_data_profiler.py,sha256=gQDHVA6-yYtg1dwH6D7RJGcCPDbE61BAkJ5i4-KHD1g,65092
  datahub/ingestion/source/ge_profiling_config.py,sha256=FlWfXoVoayabVXNMB9qETEU0GX0az6HYqNUZRnIu_fQ,10866
  datahub/ingestion/source/glue_profiling_config.py,sha256=vpMJH4Lf_qgR32BZy58suabri1yV5geaAPjzg2eORDc,2559
  datahub/ingestion/source/ldap.py,sha256=CNr3foofIpoCXu_GGqfcajlQE2qkHr5isYwVcDutdkk,18695
@@ -217,7 +217,7 @@ datahub/ingestion/source/redash.py,sha256=YxjSad-X_wPmxYH8dJmFz_VCFhiLTCTSlK99Wd
  datahub/ingestion/source/salesforce.py,sha256=CQtDFv1OsbC1vyzNbKOc6GxhFQ5GdYj45hgAF0-oIcw,40487
  datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
  datahub/ingestion/source/sql_queries.py,sha256=Ip7UZub7fgMh7P5jL_zJPY7lSkc9GGTy8GJ8lqZrcsE,9502
- datahub/ingestion/source/superset.py,sha256=QwgEeHYD2d_o5fg4bYKJLgc1UFXM_xjMcvvwWkLbAb8,35352
+ datahub/ingestion/source/superset.py,sha256=FRZ7cCURW6NHUOKaFicdAZq2caXektvO9rJE4tO9scU,40336
  datahub/ingestion/source/abs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/abs/config.py,sha256=mBQe0JTaP-Rcv4HnMUUySoYbSr4r3jDEMioxaXHnxXU,6709
  datahub/ingestion/source/abs/datalake_profiler_config.py,sha256=Rkf64evufyVGPiE4VK8QAjzBiJFu85tOGMmJ0lJZ2Og,3600
@@ -318,7 +318,7 @@ datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
  datahub/ingestion/source/gc/datahub_gc.py,sha256=EXO-Stj6gGMLTSTbSBC-C3_zpjpQtFN9pAMWR95ma0I,12830
  datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=mUWcMt-_FL1SYGIgI4lGZDZGXspUUTv__5GN1W2oJ3s,17118
  datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=y-9ZIs_DZPUzYH1CI6HmaAZg3olNNA7MjT8HrCqAI0k,11159
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=_oTXN0fzB4kYyFclah9X_1ds32bLayQyyWgoPeHQMw4,12923
+ datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=4-qQR_2HGIYU8kC2hRIsJyKKMb9lKq4B6paJm_abUk4,12628
  datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/ingestion/source/gcs/gcs_source.py,sha256=5EZkrDqjRNQz_aUL1MLp0PTFm0Ztubmk0NYJGZTRLjU,6276
  datahub/ingestion/source/gcs/gcs_utils.py,sha256=_78KM863XXgkVLmZLtYGF5PJNnZas1go-XRtOq-79lo,1047
@@ -410,9 +410,9 @@ datahub/ingestion/source/redshift/config.py,sha256=l_hlgsCjvlcgcFQpd5WMKlW8nqQUh
  datahub/ingestion/source/redshift/datashares.py,sha256=kH3YkoenOa59XZU12XeUf283lOOAITYD9jOXpy8R06E,9227
  datahub/ingestion/source/redshift/exception.py,sha256=dxzYUIv5B_FAWhOuzG2u5We7FX-ar4jhOXPXAlEIvgM,2055
  datahub/ingestion/source/redshift/lineage.py,sha256=IPF8vHy2MFyhK-hu2-lxV2-kcnNAEzltPLnnIvwIBMY,44100
- datahub/ingestion/source/redshift/lineage_v2.py,sha256=ZMxPmmZ-O-Fid6VqnaUt6FyLSPHY8LXESYLj8fTZy1g,17523
+ datahub/ingestion/source/redshift/lineage_v2.py,sha256=vQ2LBa04hqYqIRK0CP3VDYRlvMLAqodzdieDl6LipiQ,17909
  datahub/ingestion/source/redshift/profile.py,sha256=dq7m9YG3TvEMbplwVIutUpzbXLPH8KIj9SuWNo7PWWE,4323
- datahub/ingestion/source/redshift/query.py,sha256=rkWEpxW7HVCtcMQLQ5hAYenE_4q4884B4lL67OULbuo,47814
+ datahub/ingestion/source/redshift/query.py,sha256=vVIuNUaU4a7AfMFJZlgLuqi0cGVl0gVz8xZUSnPhWvs,47845
  datahub/ingestion/source/redshift/redshift.py,sha256=whMujnJxwNT2ZXnOVRrZQiy317hlsvbARzabKmI3oN8,43536
  datahub/ingestion/source/redshift/redshift_data_reader.py,sha256=zc69jwXHdF-w8J4Hq-ZQ6BjHQ75Ij2iNDMpoRJlcmlU,1724
  datahub/ingestion/source/redshift/redshift_schema.py,sha256=7F-l_omOuKMuGE_rBWXVPG_GWXFKnCMzC4frNxZB9cs,24800
@@ -895,7 +895,7 @@ datahub/metadata/schemas/ViewProperties.avsc,sha256=3HhcbH5493dJUnEUtFMYMVfbYQ52
  datahub/metadata/schemas/__init__.py,sha256=uvLNC3VyCkWA_v8e9FdA1leFf46NFKDD0AajCfihepI,581
  datahub/pydantic/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/pydantic/compat.py,sha256=TUEo4kSEeOWVAhV6LQtst1phrpVgGtK4uif4OI5vQ2M,1937
- datahub/sdk/__init__.py,sha256=FRnyQWEKsNSuqgVfxFg7afBWp8_9Yr19tKp3PySg_dg,1611
+ datahub/sdk/__init__.py,sha256=QeutS6Th8K4E4ZxXuoGrmvahN6zA9Oh9asKk5mw9AIk,1670
  datahub/sdk/_all_entities.py,sha256=lCa2oeudmrKN7suOBIT5tidGJB8pgL3xD80DQlsYUKI,399
  datahub/sdk/_attribution.py,sha256=0Trh8steVd27GOr9MKCZeawbuDD2_q3GIsZlCtHqEUg,1321
  datahub/sdk/_shared.py,sha256=pHVKEJ50BoLw0fLLAm9zYsynNDN_bPI26qlj8nk2iyY,19582
@@ -904,10 +904,10 @@ datahub/sdk/container.py,sha256=yw_vw9Jl1wOYNwMHxQHLz5ZvVQVDWWHi9CWBR3hOCd8,7547
  datahub/sdk/dataset.py,sha256=5LG4c_8bHeSPYrW88KNXRgiPD8frBjR0OBVrrwdquU4,29152
  datahub/sdk/entity.py,sha256=Q29AbpS58L4gD8ETwoNIwG-ouytz4c0MSSFi6-jLl_4,6742
  datahub/sdk/entity_client.py,sha256=Sxe6H6Vr_tqLJu5KW7MJfLWJ6mgh4mbsx7u7MOBpM64,5052
- datahub/sdk/main_client.py,sha256=WK3-cWDNaTXzeYd_aKoXhnJVywcWDddxDQjV4bGmIBI,3617
+ datahub/sdk/main_client.py,sha256=h2MKRhR-BO0zGCMhF7z2bTncX4hagKrAYwR3wTNTtzA,3666
  datahub/sdk/resolver_client.py,sha256=nKMAZJt2tRSGfKSzoREIh43PXqjM3umLiYkYHJjo1io,3243
  datahub/sdk/search_client.py,sha256=h9O_rsphkTdpd5hMPay3xSXfJM761cf4PjNCBwCnFzU,1309
- datahub/sdk/search_filters.py,sha256=GGBWvS-iq9CgnTNyc_yw_bQfUSBRa1s6jmIVRdwxiZY,11508
+ datahub/sdk/search_filters.py,sha256=WaJKFUKT9P70NBkh36f44rZrJ7zRJpRwu8mN-rEx5y0,11364
  datahub/secret/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datahub/secret/datahub_secret_store.py,sha256=9u9S87-15jwhj4h0EsAVIMdQLgvstKc8voQux2slxgU,2477
  datahub/secret/datahub_secrets_client.py,sha256=nDmhziKdvseJHlaDVUcAwK8Fv8maeAaG-ktZtWG2b70,1316
@@ -933,7 +933,7 @@ datahub/sql_parsing/_sqlglot_patch.py,sha256=dpTiGj4Jxzvbi6HkYiOTbZ81CTFwEEo6JgY
  datahub/sql_parsing/datajob.py,sha256=1X8KpEk-y3_8xJuA_Po27EHZgOcxK9QADI6Om9gSGn0,1751
  datahub/sql_parsing/query_types.py,sha256=FKjDzszZzsrCfYfm7dgD6T_8865qxWl767fdGyHWBh4,2720
  datahub/sql_parsing/schema_resolver.py,sha256=ISuingLcQnOJZkNXBkc73uPwYUbbOtERAjgGhJajDiQ,10782
- datahub/sql_parsing/split_statements.py,sha256=e1vVGJXqJbLPwjpFS6owROxVI6J0wwccyBV_9qxI0mc,9121
+ datahub/sql_parsing/split_statements.py,sha256=OIQXA9e4k3G9Z1y7rbgdtZhMWt4FPnq41cE8Jkm9cBY,9542
  datahub/sql_parsing/sql_parsing_aggregator.py,sha256=A3_0wSxBJSRowEaslptDpBoKO42XXx5UyTEK9PkekIs,70317
  datahub/sql_parsing/sql_parsing_common.py,sha256=cZ4WvVyHZuXDGjnBvKMX2_fz2EMextB5WQWcK0_saBo,3155
  datahub/sql_parsing/sql_parsing_result_utils.py,sha256=prwWTj1EB2fRPv1eMB4EkpFNafIYAt-X8TIK0NWqank,796
@@ -1043,8 +1043,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
- acryl_datahub-1.0.0.1rc3.dist-info/METADATA,sha256=BTS0H-lGX3ONF9A0odwU_U8VbAUTrcd7K56DDTjG9iM,176849
- acryl_datahub-1.0.0.1rc3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- acryl_datahub-1.0.0.1rc3.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
- acryl_datahub-1.0.0.1rc3.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
- acryl_datahub-1.0.0.1rc3.dist-info/RECORD,,
+ acryl_datahub-1.0.0.1rc4.dist-info/METADATA,sha256=0QZSNfWv2u7u7GcupcTXvYmmBOqeB7vfGNKHUyQEoNs,176849
+ acryl_datahub-1.0.0.1rc4.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ acryl_datahub-1.0.0.1rc4.dist-info/entry_points.txt,sha256=o3mDeJXSKhsy7XLkuogihraiabBdLn9HaizYXPrxmk0,9710
+ acryl_datahub-1.0.0.1rc4.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+ acryl_datahub-1.0.0.1rc4.dist-info/RECORD,,
datahub/_version.py CHANGED
@@ -1,6 +1,6 @@
  # Published at https://pypi.org/project/acryl-datahub/.
  __package_name__ = "acryl-datahub"
- __version__ = "1.0.0.1rc3"
+ __version__ = "1.0.0.1rc4"


  def is_dev_mode() -> bool:
datahub/ingestion/graph/filters.py CHANGED
@@ -1,6 +1,8 @@
  import dataclasses
  import enum
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Literal, Optional
+
+ from typing_extensions import TypeAlias

  from datahub.emitter.mce_builder import (
      make_data_platform_urn,
@@ -10,11 +12,29 @@ from datahub.utilities.urns.urn import guess_entity_type

  RawSearchFilterRule = Dict[str, Any]

+ # Mirrors our GraphQL enum: https://datahubproject.io/docs/graphql/enums#filteroperator
+ FilterOperator: TypeAlias = Literal[
+     "CONTAIN",
+     "EQUAL",
+     "IEQUAL",
+     "IN",
+     "EXISTS",
+     "GREATER_THAN",
+     "GREATER_THAN_OR_EQUAL_TO",
+     "LESS_THAN",
+     "LESS_THAN_OR_EQUAL_TO",
+     "START_WITH",
+     "END_WITH",
+     "DESCENDANTS_INCL",
+     "ANCESTORS_INCL",
+     "RELATED_INCL",
+ ]
+

  @dataclasses.dataclass
  class SearchFilterRule:
      field: str
-     condition: str  # TODO: convert to an enum
+     condition: FilterOperator
      values: List[str]
      negated: bool = False

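Note: the practical effect of typing condition as FilterOperator is that filter conditions are checked statically instead of being free-form strings. A minimal sketch of how a caller might build a rule (the values are illustrative; to_raw() is the same helper the gc source uses further down):

from datahub.ingestion.graph.filters import SearchFilterRule

# "LESS_THAN" must be one of the FilterOperator literals; a typo such as
# "LESS_THEN" is now flagged by a type checker instead of failing at query time.
rule = SearchFilterRule(
    field="created",
    condition="LESS_THAN",
    values=["1704067200000"],
)
raw_rule = rule.to_raw()  # plain-dict form (RawSearchFilterRule) accepted by graph search helpers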
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py CHANGED
@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
  from datahub.ingestion.api.common import PipelineContext
  from datahub.ingestion.api.source import SourceReport
  from datahub.ingestion.graph.client import DataHubGraph
- from datahub.ingestion.graph.filters import RemovedStatusFilter
+ from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
  from datahub.utilities.lossy_collections import LossyList
  from datahub.utilities.stats_collections import TopKDict
  from datahub.utilities.urns._urn_base import Urn
+ from datahub.utilities.urns.error import InvalidUrnError

  logger = logging.getLogger(__name__)

- QUERY_ENTITIES = """
- query listEntities($input: ScrollAcrossEntitiesInput!) {
-   scrollAcrossEntities(input: $input) {
-     nextScrollId
-     count
-     searchResults {
-       entity {
-         ... on QueryEntity {
-           urn
-         }
-         ... on DataProcessInstance {
-           urn
-         }
-       }
-     }
-   }
- }
- """
-

  class SoftDeletedEntitiesCleanupConfig(ConfigModel):
      enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
      )

      entity_types: Optional[List[str]] = Field(
-         default=None,
+         # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+         default=[
+             "dataset",
+             "dashboard",
+             "chart",
+             "mlmodel",
+             "mlmodelGroup",
+             "mlfeatureTable",
+             "mlfeature",
+             "mlprimaryKey",
+             "dataFlow",
+             "dataJob",
+             "glossaryTerm",
+             "glossaryNode",
+             "tag",
+             "role",
+             "corpuser",
+             "corpGroup",
+             "container",
+             "domain",
+             "dataProduct",
+             "notebook",
+             "businessAttribute",
+             "schemaField",
+             "query",
+             "dataProcessInstance",
+         ],
          description="List of entity types to cleanup",
      )

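Note: because this is an ordinary pydantic default, a user-supplied entity_types list replaces it entirely, so query and dataProcessInstance are only cleaned up when an override lists them too. A small illustrative sketch, assuming the remaining config fields keep their declared defaults:

from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
    SoftDeletedEntitiesCleanupConfig,
)

default_cfg = SoftDeletedEntitiesCleanupConfig()
assert "dataProcessInstance" in (default_cfg.entity_types or [])

# Overriding the field replaces the whole default list:
narrow_cfg = SoftDeletedEntitiesCleanupConfig(entity_types=["dataset"])
assert narrow_cfg.entity_types == ["dataset"]  # query/dataProcessInstance no longer covered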
@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
      num_entities_found: Dict[str, int] = field(default_factory=dict)
      num_soft_deleted_entity_processed: int = 0
      num_soft_deleted_retained_due_to_age: int = 0
+     num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+         default_factory=TopKDict
+     )
      num_soft_deleted_entity_removal_started: int = 0
      num_hard_deleted: int = 0
      num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
      )
      runtime_limit_reached: bool = False
      deletion_limit_reached: bool = False
+     num_soft_deleted_entity_found: int = 0
+     num_soft_deleted_entity_invalid_urn: int = 0


  class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
          self.config = config
          self.report = report
          self.dry_run = dry_run
-         self.start_time = 0.0
+         self.start_time = time.time()
          self._report_lock: Lock = Lock()
          self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
          with self._report_lock:
              self.report.num_soft_deleted_retained_due_to_age += 1

+     def _increment_retained_by_type(self, type: str) -> None:
+         """Thread-safe method to update report fields"""
+         with self._report_lock:
+             self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                 self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                 + 1
+             )
+
      def _increment_removal_started_count(self) -> None:
          """Thread-safe method to update report fields"""
          with self._report_lock:
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
              )
          self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-     def delete_entity(self, urn: str) -> None:
+     def delete_entity(self, urn: Urn) -> None:
          assert self.ctx.graph

-         entity_urn = Urn.from_string(urn)
          if self.dry_run:
              logger.info(
                  f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
          if self._deletion_limit_reached() or self._times_up():
              return
          self._increment_removal_started_count()
-         self.ctx.graph.delete_entity(urn=urn, hard=True)
+         self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
          self.ctx.graph.delete_references_to_urn(
-             urn=urn,
+             urn=urn.urn(),
              dry_run=False,
          )
-         self._update_report(urn, entity_urn.entity_type)
+         self._update_report(urn.urn(), urn.entity_type)

-     def delete_soft_deleted_entity(self, urn: str) -> None:
+     def delete_soft_deleted_entity(self, urn: Urn) -> None:
          assert self.ctx.graph

          retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
              - self.config.retention_days * 24 * 60 * 60
          )

-         aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+         aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
          if "status" in aspect["aspects"]:
              if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                  "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                  self.delete_entity(urn)
              else:
                  self._increment_retained_count()
+                 self._increment_retained_by_type(urn.entity_type)

      def _print_report(self) -> None:
          time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
              self.last_print_time = time.time()
              logger.info(f"\n{self.report.as_string()}")

-     def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+     def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
          done, not_done = wait(futures, return_when=FIRST_COMPLETED)
          futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                  self.report.failure(
                      title="Failed to delete entity",
                      message="Failed to delete entity",
-                     context=futures[future],
+                     context=futures[future].urn(),
                      exc=future.exception(),
                  )
              self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
              time.sleep(self.config.delay)
          return futures

-     def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
+     def _get_urns(self) -> Iterable[str]:
          assert self.ctx.graph
-         scroll_id: Optional[str] = None
-
-         batch_size = self.config.batch_size
-         if entity_type == "DATA_PROCESS_INSTANCE":
-             # Due to a bug in Data process instance querying this is a temp workaround
-             # to avoid a giant stacktrace by having a smaller batch size in first call
-             # This will be remove in future version after server with fix has been
-             # around for a while
-             batch_size = 10
-
-         while True:
-             try:
-                 if entity_type not in self.report.num_calls_made:
-                     self.report.num_calls_made[entity_type] = 1
-                 else:
-                     self.report.num_calls_made[entity_type] += 1
-                 self._print_report()
-                 result = self.ctx.graph.execute_graphql(
-                     graphql_query,
-                     {
-                         "input": {
-                             "types": [entity_type],
-                             "query": "*",
-                             "scrollId": scroll_id if scroll_id else None,
-                             "count": batch_size,
-                             "orFilters": [
-                                 {
-                                     "and": [
-                                         {
-                                             "field": "removed",
-                                             "values": ["true"],
-                                             "condition": "EQUAL",
-                                         }
-                                     ]
-                                 }
-                             ],
-                         }
-                     },
-                 )
-             except Exception as e:
-                 self.report.failure(
-                     f"While trying to get {entity_type} with {scroll_id}", exc=e
-                 )
-                 break
-             scroll_across_entities = result.get("scrollAcrossEntities")
-             if not scroll_across_entities:
-                 break
-             search_results = scroll_across_entities.get("searchResults")
-             count = scroll_across_entities.get("count")
-             if not count or not search_results:
-                 # Due to a server bug we cannot rely on just count as it was returning response like this
-                 # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                 break
-             if entity_type == "DATA_PROCESS_INSTANCE":
-                 # Temp workaround. See note in beginning of the function
-                 # We make the batch size = config after call has succeeded once
-                 batch_size = self.config.batch_size
-             scroll_id = scroll_across_entities.get("nextScrollId")
-             if entity_type not in self.report.num_entities_found:
-                 self.report.num_entities_found[entity_type] = 0
-             self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                 "count"
+         # Entities created in the retention period are not considered for deletion
+         created_from = int(
+             (
+                 datetime.now(timezone.utc).timestamp()
+                 - self.config.retention_days * 24 * 60 * 60
              )
-             for query in search_results:
-                 yield query["entity"]["urn"]
+             * 1000
+         )
+
+         entity_types = self.config.entity_types
+         # dataProcessInstance is a special case where we need to get the entities separately
+         # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+         # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+         if (
+             self.config.entity_types
+             and "dataProcessInstance" in self.config.entity_types
+         ):
+             entity_types = self.config.entity_types.copy()
+             yield from self.ctx.graph.get_urns_by_filter(
+                 entity_types=["dataProcessInstance"],
+                 platform=self.config.platform,
+                 env=self.config.env,
+                 query=self.config.query,
+                 status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                 batch_size=self.config.batch_size,
+                 extraFilters=[
+                     SearchFilterRule(
+                         field="created",
+                         condition="LESS_THAN",
+                         values=[f"{created_from}"],
+                     ).to_raw()
+                 ],
+             )
+
+             entity_types.remove("dataProcessInstance")

-     def _get_urns(self) -> Iterable[str]:
-         assert self.ctx.graph
          yield from self.ctx.graph.get_urns_by_filter(
-             entity_types=self.config.entity_types,
+             entity_types=entity_types,
              platform=self.config.platform,
              env=self.config.env,
              query=self.config.query,
              status=RemovedStatusFilter.ONLY_SOFT_DELETED,
              batch_size=self.config.batch_size,
          )
-         yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-         yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

      def _times_up(self) -> bool:
          if (
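Note: created_from above is the retention cutoff expressed in epoch milliseconds (hence the * 1000); it is paired with condition="LESS_THAN" so that only dataProcessInstance entities created before the retention window become deletion candidates. A standalone illustration of the same arithmetic, with example values:

from datetime import datetime, timezone

retention_days = 10  # example value; the real value comes from the source config
now = datetime(2024, 1, 11, tzinfo=timezone.utc)  # fixed "now" for a reproducible result

created_from = int((now.timestamp() - retention_days * 24 * 60 * 60) * 1000)
print(created_from)  # 1704067200000, i.e. 2024-01-01T00:00:00Z in epoch milliseconds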
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
              return
          self.start_time = time.time()

-         futures: Dict[Future, str] = dict()
+         futures: Dict[Future, Urn] = dict()
          with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
              for urn in self._get_urns():
+                 try:
+                     self.report.num_soft_deleted_entity_found += 1
+                     soft_deleted_urn = Urn.from_string(urn)
+                 except InvalidUrnError as e:
+                     logger.error(f"Failed to parse urn {urn} with error {e}")
+                     self.report.num_soft_deleted_entity_invalid_urn += 1
+                     continue
+
                  self._print_report()
                  while len(futures) >= self.config.futures_max_at_time:
                      futures = self._process_futures(futures)
                  if self._deletion_limit_reached() or self._times_up():
                      break
-                 future = executor.submit(self.delete_soft_deleted_entity, urn)
-                 futures[future] = urn
+                 future = executor.submit(
+                     self.delete_soft_deleted_entity, soft_deleted_urn
+                 )
+                 futures[future] = soft_deleted_urn

              logger.info(f"Waiting for {len(futures)} futures to complete")
              while len(futures) > 0:
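Note: the run loop now parses every URN before scheduling deletion and skips invalid ones instead of aborting. A minimal sketch of that parse-or-skip pattern (the example URNs are illustrative):

from datahub.utilities.urns._urn_base import Urn
from datahub.utilities.urns.error import InvalidUrnError

for raw in ["urn:li:corpuser:jdoe", "not-a-valid-urn"]:
    try:
        urn = Urn.from_string(raw)
    except InvalidUrnError as e:
        # the cleanup source counts this as num_soft_deleted_entity_invalid_urn and moves on
        print(f"skipping {raw}: {e}")
        continue
    print(urn.entity_type, urn.urn())  # e.g. "corpuser", "urn:li:corpuser:jdoe"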
datahub/ingestion/source/ge_data_profiler.py CHANGED
@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
          if not self.config.include_field_median_value:
              return
          try:
-             if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
+             if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                  column_profile.median = str(
                      self.dataset.engine.execute(
                          sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                          )
                      ).scalar()
                  )
+             elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                 column_profile.median = str(
+                     self.dataset.engine.execute(
+                         sa.select(
+                             sa.text(
+                                 f"approx_percentile(`{column}`, 0.5) as approx_median"
+                             )
+                         ).select_from(self.dataset._table)
+                     ).scalar()
+                 )
              elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                  column_profile.median = str(
                      self.dataset.engine.execute(
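Note: for Databricks the profiler now computes the column median via approx_percentile rather than sa.func.median. A rough standalone sketch of the statement the new branch builds, assuming SQLAlchemy 1.4+ calling conventions; the table and column names are placeholders, and sa.table stands in for the profiler's internal self.dataset._table:

import sqlalchemy as sa

column = "amount"
stmt = sa.select(
    sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
).select_from(sa.table("orders"))

print(stmt)  # roughly: SELECT approx_percentile(`amount`, 0.5) as approx_median FROM orders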
datahub/ingestion/source/redshift/lineage_v2.py CHANGED
@@ -400,6 +400,10 @@ class RedshiftSqlLineageV2(Closeable):
          db_schemas: Dict[str, Dict[str, RedshiftSchema]],
      ) -> None:
          for schema_name, tables in all_tables[self.database].items():
+             logger.info(f"External table lineage: checking schema {schema_name}")
+             if not db_schemas[self.database].get(schema_name):
+                 logger.warning(f"Schema {schema_name} not found")
+                 continue
              for table in tables:
                  schema = db_schemas[self.database][schema_name]
                  if (
@@ -407,6 +411,9 @@ class RedshiftSqlLineageV2(Closeable):
                      and schema.is_external_schema()
                      and schema.external_platform
                  ):
+                     logger.info(
+                         f"External table lineage: processing table {schema_name}.{table.name}"
+                     )
                      # external_db_params = schema.option
                      upstream_platform = schema.external_platform.lower()

datahub/ingestion/source/redshift/query.py CHANGED
@@ -44,7 +44,7 @@ class RedshiftCommonQuery:
      SELECT
          schema_name,
          schema_type,
-         schema_option,
+         cast(null as varchar(1024)) as schema_option,
          cast(null as varchar(256)) as external_platform,
          cast(null as varchar(256)) as external_database
      FROM svv_redshift_schemas