acryl-datahub 1.0.0rc17__py3-none-any.whl → 1.0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2426 -2427
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +106 -89
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -28
- datahub/cli/specific/dataset_cli.py +26 -10
- datahub/emitter/mce_builder.py +1 -3
- datahub/emitter/mcp_builder.py +8 -0
- datahub/emitter/request_helper.py +19 -14
- datahub/emitter/response_helper.py +25 -18
- datahub/emitter/rest_emitter.py +23 -7
- datahub/errors.py +8 -0
- datahub/ingestion/api/source.py +7 -2
- datahub/ingestion/api/source_helpers.py +14 -2
- datahub/ingestion/extractor/schema_util.py +1 -0
- datahub/ingestion/graph/client.py +26 -20
- datahub/ingestion/graph/filters.py +62 -17
- datahub/ingestion/sink/datahub_rest.py +2 -2
- datahub/ingestion/source/cassandra/cassandra.py +1 -10
- datahub/ingestion/source/common/data_platforms.py +23 -0
- datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
- datahub/ingestion/source/common/subtypes.py +17 -1
- datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
- datahub/ingestion/source/dbt/dbt_common.py +6 -4
- datahub/ingestion/source/dbt/dbt_core.py +4 -6
- datahub/ingestion/source/dbt/dbt_tests.py +8 -6
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
- datahub/ingestion/source/dremio/dremio_entities.py +6 -5
- datahub/ingestion/source/dremio/dremio_source.py +96 -117
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
- datahub/ingestion/source/ge_data_profiler.py +11 -1
- datahub/ingestion/source/hex/__init__.py +0 -0
- datahub/ingestion/source/hex/api.py +394 -0
- datahub/ingestion/source/hex/constants.py +3 -0
- datahub/ingestion/source/hex/hex.py +167 -0
- datahub/ingestion/source/hex/mapper.py +372 -0
- datahub/ingestion/source/hex/model.py +68 -0
- datahub/ingestion/source/iceberg/iceberg.py +193 -140
- datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
- datahub/ingestion/source/mlflow.py +217 -8
- datahub/ingestion/source/mode.py +11 -1
- datahub/ingestion/source/openapi.py +69 -34
- datahub/ingestion/source/powerbi/config.py +31 -4
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
- datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
- datahub/ingestion/source/powerbi/powerbi.py +41 -24
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
- datahub/ingestion/source/redshift/lineage_v2.py +9 -1
- datahub/ingestion/source/redshift/query.py +1 -1
- datahub/ingestion/source/s3/source.py +11 -0
- datahub/ingestion/source/sigma/config.py +3 -4
- datahub/ingestion/source/sigma/sigma.py +10 -6
- datahub/ingestion/source/slack/slack.py +399 -82
- datahub/ingestion/source/snowflake/constants.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
- datahub/ingestion/source/snowflake/snowflake_queries.py +16 -13
- datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
- datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
- datahub/ingestion/source/sql/mssql/job_models.py +15 -1
- datahub/ingestion/source/sql/mssql/source.py +8 -4
- datahub/ingestion/source/sql/oracle.py +51 -4
- datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
- datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
- datahub/ingestion/source/superset.py +291 -35
- datahub/ingestion/source/usage/usage_common.py +0 -65
- datahub/ingestion/source/vertexai/__init__.py +0 -0
- datahub/ingestion/source/vertexai/vertexai.py +1055 -0
- datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
- datahub/metadata/_schema_classes.py +472 -1
- datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
- datahub/metadata/schema.avsc +313 -2
- datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
- datahub/metadata/schemas/CorpUserKey.avsc +2 -1
- datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/Deprecation.avsc +2 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
- datahub/metadata/schemas/QueryProperties.avsc +20 -0
- datahub/metadata/schemas/Siblings.avsc +2 -0
- datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
- datahub/sdk/__init__.py +1 -0
- datahub/sdk/dataset.py +122 -0
- datahub/sdk/entity.py +99 -3
- datahub/sdk/entity_client.py +27 -3
- datahub/sdk/main_client.py +24 -1
- datahub/sdk/search_client.py +81 -8
- datahub/sdk/search_filters.py +94 -37
- datahub/sql_parsing/split_statements.py +17 -3
- datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
- datahub/sql_parsing/tool_meta_extractor.py +27 -2
- datahub/testing/mcp_diff.py +1 -18
- datahub/utilities/threaded_iterator_executor.py +16 -3
- datahub/ingestion/source/vertexai.py +0 -697
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc17.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_source.py

@@ -1,7 +1,6 @@
 import logging
-import re
-from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional

 from datahub.emitter.mce_builder import (
@@ -28,7 +27,10 @@ from datahub.ingestion.source.dremio.dremio_api import (
     DremioEdition,
 )
 from datahub.ingestion.source.dremio.dremio_aspects import DremioAspects
-from datahub.ingestion.source.dremio.dremio_config import
+from datahub.ingestion.source.dremio.dremio_config import (
+    DremioSourceConfig,
+    DremioSourceMapping,
+)
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
 )
@@ -39,6 +41,7 @@ from datahub.ingestion.source.dremio.dremio_entities import (
     DremioDatasetType,
     DremioGlossaryTerm,
     DremioQuery,
+    DremioSourceContainer,
 )
 from datahub.ingestion.source.dremio.dremio_profiling import DremioProfiler
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
@@ -65,6 +68,17 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)


+@dataclass
+class DremioSourceMapEntry:
+    platform: str
+    source_name: str
+    dremio_source_category: str
+    root_path: str = ""
+    database_name: str = ""
+    platform_instance: Optional[str] = None
+    env: Optional[str] = None
+
+
 @platform_name("Dremio")
 @config_class(DremioSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
@@ -112,7 +126,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
-        self.source_map: Dict[str,
+        self.source_map: Dict[str, DremioSourceMapEntry] = dict()

         # Initialize API operations
         dremio_api = DremioAPIOperations(self.config, self.report)
@@ -152,111 +166,12 @@ class DremioSource(StatefulIngestionSourceBase):
     def get_platform(self) -> str:
         return "dremio"

-    def _build_source_map(self) -> Dict[str,
-        """
-        Builds a source mapping dictionary to support external lineage generation across
-        multiple Dremio sources, based on provided configuration mappings.
-
-        This method operates as follows:
-
-        1. If a source mapping is present in the config:
-           - For each source in the Dremio catalog, if the mapping's `source_name` matches
-             the `dremio_source_type`, `root_path` and `database_name` are added to the mapping
-             information, along with the platform, platform instance, and environment if they exist.
-             This allows constructing the full URN for upstream lineage.
-
-        2. If a source mapping is absent in the configuration:
-           - Default mappings are created for each source name, setting `env` and `platform_instance`
-             to default values and classifying the source type. This ensures all sources have a
-             mapping, even if specific configuration details are missing.
-
-        Returns:
-            Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
-            (lowercased) and each value is another dictionary containing:
-                - `platform`: The source platform.
-                - `source_name`: The source name.
-                - `dremio_source_type`: The type mapped to DataHub,
-                  e.g., "database", "folder".
-                - Optional `root_path`, `database_name`, `platform_instance`,
-                  and `env` if provided in the configuration.
-        Example:
-            This method is used internally within the class to generate mappings before
-            creating cross-platform lineage.
-
-        """
-
-        source_map = {}
+    def _build_source_map(self) -> Dict[str, DremioSourceMapEntry]:
         dremio_sources = self.dremio_catalog.get_sources()
+        source_mappings_config = self.config.source_mappings or []

-
-
-            if isinstance(source.dremio_source_type, str):
-                source_type = source.dremio_source_type.lower()
-                root_path = source.root_path.lower() if source.root_path else ""
-                database_name = (
-                    source.database_name.lower() if source.database_name else ""
-                )
-                source_present = False
-                source_platform_name = source_name
-
-                for mapping in self.config.source_mappings or []:
-                    if re.search(mapping.source_name, source_type, re.IGNORECASE):
-                        source_platform_name = mapping.source_name.lower()
-
-                    datahub_source_type = (
-                        DremioToDataHubSourceTypeMapping.get_datahub_source_type(
-                            source_type
-                        )
-                    )
-
-                    if re.search(mapping.platform, datahub_source_type, re.IGNORECASE):
-                        source_platform_name = source_platform_name.lower()
-                        source_map[source_platform_name] = {
-                            "platform": mapping.platform,
-                            "source_name": mapping.source_name,
-                            "dremio_source_type": DremioToDataHubSourceTypeMapping.get_category(
-                                source_type,
-                            ),
-                            "root_path": root_path,
-                            "database_name": database_name,
-                            "platform_instance": mapping.platform_instance,
-                            "env": mapping.env,
-                        }
-                        source_present = True
-                        break
-
-                if not source_present:
-                    try:
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-                    except Exception as exc:
-                        logger.info(
-                            f"Source {source_type} is not a standard Dremio source type. "
-                            f"Adding source_type {source_type} to mapping as database. Error: {exc}"
-                        )
-
-                        DremioToDataHubSourceTypeMapping.add_mapping(
-                            source_type, source_name
-                        )
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-
-                    source_map[source_platform_name.lower()] = {
-                        "platform": source_type,
-                        "source_name": source_name,
-                        "dremio_source_type": dremio_source_type,
-                    }
-
-            else:
-                logger.error(
-                    f'Source "{source.container_name}" is broken. Containers will not be created for source.'
-                )
-                logger.error(
-                    f'No new cross-platform lineage will be emitted for source "{source.container_name}".'
-                )
-                logger.error("Fix this source in Dremio to fix this issue.")
+        source_map = build_dremio_source_map(dremio_sources, source_mappings_config)
+        logger.info(f"Full source map: {source_map}")

         return source_map

@@ -431,6 +346,7 @@ class DremioSource(StatefulIngestionSourceBase):
             dremio_path=dataset_info.path,
             dremio_dataset=dataset_info.resource_name,
         )
+        logger.debug(f"Upstream dataset for {dataset_urn}: {upstream_urn}")

         if upstream_urn:
             upstream_lineage = UpstreamLineage(
@@ -596,25 +512,23 @@ class DremioSource(StatefulIngestionSourceBase):
         if not mapping:
             return None

-        platform = mapping.
+        platform = mapping.platform
         if not platform:
             return None

-        platform_instance = mapping.
-
-        )
-        env = mapping.get("env", self.config.env)
+        platform_instance = mapping.platform_instance
+        env = mapping.env or self.config.env

         root_path = ""
         database_name = ""

-        if mapping.
-            if mapping.
-                root_path = f"{mapping
+        if mapping.dremio_source_category == "file_object_storage":
+            if mapping.root_path:
+                root_path = f"{mapping.root_path[1:]}/"
             dremio_dataset = f"{root_path}{'/'.join(dremio_path[1:])}/{dremio_dataset}"
         else:
-            if mapping.
-                database_name = f"{mapping
+            if mapping.database_name:
+                database_name = f"{mapping.database_name}."
             dremio_dataset = (
                 f"{database_name}{'.'.join(dremio_path[1:])}.{dremio_dataset}"
             )
@@ -639,3 +553,68 @@ class DremioSource(StatefulIngestionSourceBase):
         Get the source report.
         """
         return self.report
+
+
+def build_dremio_source_map(
+    dremio_sources: Iterable[DremioSourceContainer],
+    source_mappings_config: List[DremioSourceMapping],
+) -> Dict[str, DremioSourceMapEntry]:
+    """
+    Builds a source mapping dictionary to support external lineage generation across
+    multiple Dremio sources, based on provided configuration mappings.
+
+    This method operates as follows:
+
+    Returns:
+        Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
+        (lowercased) and each value is another entry containing:
+            - `platform`: The source platform.
+            - `source_name`: The source name.
+            - `dremio_source_category`: The type mapped to DataHub,
+              e.g., "database", "folder".
+            - Optional `root_path`, `database_name`, `platform_instance`,
+              and `env` if provided in the configuration.
+    Example:
+        This method is used internally within the class to generate mappings before
+        creating cross-platform lineage.

+    """
+    source_map = {}
+    for source in dremio_sources:
+        current_source_name = source.container_name
+
+        source_type = source.dremio_source_type.lower()
+        source_category = DremioToDataHubSourceTypeMapping.get_category(source_type)
+        datahub_platform = DremioToDataHubSourceTypeMapping.get_datahub_platform(
+            source_type
+        )
+        root_path = source.root_path.lower() if source.root_path else ""
+        database_name = source.database_name.lower() if source.database_name else ""
+        source_present = False
+
+        for mapping in source_mappings_config:
+            if mapping.source_name.lower() == current_source_name.lower():
+                source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                    platform=mapping.platform,
+                    source_name=mapping.source_name,
+                    dremio_source_category=source_category,
+                    root_path=root_path,
+                    database_name=database_name,
+                    platform_instance=mapping.platform_instance,
+                    env=mapping.env,
+                )
+                source_present = True
+                break
+
+        if not source_present:
+            source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                platform=datahub_platform,
+                source_name=current_source_name,
+                dremio_source_category=source_category,
+                root_path=root_path,
+                database_name=database_name,
+                platform_instance=None,
+                env=None,
+            )
+
+    return source_map
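The refactor above replaces the untyped per-source dictionaries with the DremioSourceMapEntry dataclass and moves the mapping logic into the module-level build_dremio_source_map helper, keyed by lowercased source name. The snippet below is a minimal, self-contained sketch of that lookup pattern, not the DataHub code itself: SourceMapEntry, lookup_entry, and the sample source names are illustrative stand-ins.

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class SourceMapEntry:
    # Stand-in for the DremioSourceMapEntry added in this release.
    platform: str
    source_name: str
    dremio_source_category: str
    root_path: str = ""
    database_name: str = ""
    platform_instance: Optional[str] = None
    env: Optional[str] = None


def lookup_entry(
    source_map: Dict[str, SourceMapEntry], dremio_path: List[str]
) -> Optional[SourceMapEntry]:
    # Keys are lowercased source names, mirroring build_dremio_source_map.
    return source_map.get(dremio_path[0].lower()) if dremio_path else None


source_map = {
    "s3_lake": SourceMapEntry(
        platform="s3",
        source_name="S3_Lake",
        dremio_source_category="file_object_storage",
        root_path="/lake",
    ),
    "warehouse": SourceMapEntry(
        platform="snowflake",
        source_name="warehouse",
        dremio_source_category="database",
        database_name="analytics",
        env="PROD",
    ),
}

entry = lookup_entry(source_map, ["warehouse", "public", "orders"])
assert entry is not None and entry.platform == "snowflake"

Compared with the old dictionary entries, attribute access such as entry.platform or entry.dremio_source_category fails loudly on a typo, which is what the mapping.platform and mapping.dremio_source_category reads later in this diff rely on.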
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py

@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError

 logger = logging.getLogger(__name__)

-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-  scrollAcrossEntities(input: $input) {
-    nextScrollId
-    count
-    searchResults {
-      entity {
-        ... on QueryEntity {
-          urn
-        }
-        ... on DataProcessInstance {
-          urn
-        }
-      }
-    }
-  }
-}
-"""
-

 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )

     entity_types: Optional[List[str]] = Field(
-        default
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )

@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0


 class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time =
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0

@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1

+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
         )
         self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)

-    def delete_entity(self, urn:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn,
+        self._update_report(urn.urn(), urn.entity_type)

-    def delete_soft_deleted_entity(self, urn:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph

         retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )

-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)

     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")

-    def _process_futures(self, futures: Dict[Future,
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}

@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
             time.sleep(self.config.delay)
         return futures

-    def
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-
-
-
-
-
-        # to avoid a giant stacktrace by having a smaller batch size in first call
-        # This will be remove in future version after server with fix has been
-        # around for a while
-        batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
             )
-
-
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")

-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=
+            entity_types=entity_types,
             platform=self.config.platform,
             env=self.config.env,
             query=self.config.query,
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")

     def _times_up(self) -> bool:
         if (
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()

-        futures: Dict[Future,
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                 if self._deletion_limit_reached() or self._times_up():
                     break
-                future = executor.submit(
-
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn

             logger.info(f"Waiting for {len(futures)} futures to complete")
             while len(futures) > 0:
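The cleanup source above now collects soft-deleted URNs through get_urns_by_filter, limiting dataProcessInstance entities to those created before a retention cutoff expressed in epoch milliseconds, and it parses every URN before scheduling a hard delete on a thread pool. The sketch below reproduces only the cutoff arithmetic and the validate-then-submit shape; valid_urns and hard_delete are made-up placeholders for the Urn.from_string/InvalidUrnError handling and graph.delete_entity calls, not DataHub APIs.

from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from typing import Iterable, List


def created_cutoff_ms(retention_days: int) -> int:
    # Entities created after this epoch-millisecond cutoff are retained.
    return int(
        (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
    )


def valid_urns(raw: Iterable[str]) -> List[str]:
    # Placeholder shape check; the real code uses Urn.from_string and InvalidUrnError.
    kept = []
    for urn in raw:
        if not urn.startswith("urn:li:"):
            print(f"skipping invalid urn: {urn}")
            continue
        kept.append(urn)
    return kept


def hard_delete(urn: str) -> str:
    # Placeholder for graph.delete_entity(urn, hard=True) plus reference cleanup.
    return f"deleted {urn}"


print("created <", created_cutoff_ms(retention_days=10))
urns = valid_urns(
    ["urn:li:dataset:(urn:li:dataPlatform:hive,db.tbl,PROD)", "not-a-urn"]
)
with ThreadPoolExecutor(max_workers=4) as pool:
    for outcome in pool.map(hard_delete, urns):
        print(outcome)

Counting parse failures separately (num_soft_deleted_entity_invalid_urn in the report) keeps one malformed URN from aborting the whole run, which is the point of the try/except added around Urn.from_string.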
datahub/ingestion/source/ge_data_profiler.py

@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower()
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
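For Databricks, the profiler above now derives the column median from approx_percentile(col, 0.5) injected as a raw SQL fragment via sa.text. The snippet below is a small SQLAlchemy sketch of assembling and rendering such a statement; the sales table and price column are made up, and no Databricks connection is needed just to print the SQL.

import sqlalchemy as sa

column = "price"  # hypothetical column name
stmt = sa.select(
    sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
).select_from(sa.table("sales"))  # hypothetical table name

# Renders roughly as: SELECT approx_percentile(`price`, 0.5) as approx_median FROM sales
print(stmt)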