acryl-datahub 1.0.0rc18__py3-none-any.whl → 1.0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry flags this version of acryl-datahub as possibly problematic.
Files changed (106)
  1. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/METADATA +2391 -2392
  2. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/RECORD +105 -88
  3. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/entry_points.txt +2 -1
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/dataset/dataset.py +1 -28
  7. datahub/cli/specific/dataset_cli.py +26 -10
  8. datahub/emitter/mce_builder.py +1 -3
  9. datahub/emitter/mcp_builder.py +8 -0
  10. datahub/emitter/request_helper.py +19 -14
  11. datahub/emitter/response_helper.py +25 -18
  12. datahub/emitter/rest_emitter.py +23 -7
  13. datahub/errors.py +8 -0
  14. datahub/ingestion/api/source.py +7 -2
  15. datahub/ingestion/api/source_helpers.py +14 -2
  16. datahub/ingestion/extractor/schema_util.py +1 -0
  17. datahub/ingestion/graph/client.py +26 -20
  18. datahub/ingestion/graph/filters.py +62 -17
  19. datahub/ingestion/sink/datahub_rest.py +2 -2
  20. datahub/ingestion/source/cassandra/cassandra.py +1 -10
  21. datahub/ingestion/source/common/data_platforms.py +23 -0
  22. datahub/ingestion/source/common/gcp_credentials_config.py +6 -0
  23. datahub/ingestion/source/common/subtypes.py +17 -1
  24. datahub/ingestion/source/data_lake_common/path_spec.py +21 -1
  25. datahub/ingestion/source/dbt/dbt_common.py +6 -4
  26. datahub/ingestion/source/dbt/dbt_core.py +4 -6
  27. datahub/ingestion/source/dbt/dbt_tests.py +8 -6
  28. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +1 -1
  29. datahub/ingestion/source/dremio/dremio_entities.py +6 -5
  30. datahub/ingestion/source/dremio/dremio_source.py +96 -117
  31. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +101 -104
  32. datahub/ingestion/source/ge_data_profiler.py +11 -1
  33. datahub/ingestion/source/hex/__init__.py +0 -0
  34. datahub/ingestion/source/hex/api.py +394 -0
  35. datahub/ingestion/source/hex/constants.py +3 -0
  36. datahub/ingestion/source/hex/hex.py +167 -0
  37. datahub/ingestion/source/hex/mapper.py +372 -0
  38. datahub/ingestion/source/hex/model.py +68 -0
  39. datahub/ingestion/source/iceberg/iceberg.py +193 -140
  40. datahub/ingestion/source/iceberg/iceberg_profiler.py +21 -18
  41. datahub/ingestion/source/mlflow.py +217 -8
  42. datahub/ingestion/source/mode.py +11 -1
  43. datahub/ingestion/source/openapi.py +69 -34
  44. datahub/ingestion/source/powerbi/config.py +31 -4
  45. datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
  46. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +111 -10
  47. datahub/ingestion/source/powerbi/m_query/resolver.py +10 -0
  48. datahub/ingestion/source/powerbi/powerbi.py +41 -24
  49. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +11 -11
  50. datahub/ingestion/source/redshift/lineage_v2.py +9 -1
  51. datahub/ingestion/source/redshift/query.py +1 -1
  52. datahub/ingestion/source/s3/source.py +11 -0
  53. datahub/ingestion/source/sigma/config.py +3 -4
  54. datahub/ingestion/source/sigma/sigma.py +10 -6
  55. datahub/ingestion/source/slack/slack.py +399 -82
  56. datahub/ingestion/source/snowflake/constants.py +1 -0
  57. datahub/ingestion/source/snowflake/snowflake_config.py +14 -1
  58. datahub/ingestion/source/snowflake/snowflake_query.py +17 -0
  59. datahub/ingestion/source/snowflake/snowflake_report.py +3 -0
  60. datahub/ingestion/source/snowflake/snowflake_schema.py +29 -0
  61. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +112 -42
  62. datahub/ingestion/source/snowflake/snowflake_utils.py +25 -1
  63. datahub/ingestion/source/sql/mssql/job_models.py +15 -1
  64. datahub/ingestion/source/sql/mssql/source.py +8 -4
  65. datahub/ingestion/source/sql/oracle.py +51 -4
  66. datahub/ingestion/source/sql/stored_procedures/__init__.py +0 -0
  67. datahub/ingestion/source/sql/stored_procedures/base.py +242 -0
  68. datahub/ingestion/source/sql/{mssql/stored_procedure_lineage.py → stored_procedures/lineage.py} +1 -29
  69. datahub/ingestion/source/superset.py +291 -35
  70. datahub/ingestion/source/usage/usage_common.py +0 -65
  71. datahub/ingestion/source/vertexai/__init__.py +0 -0
  72. datahub/ingestion/source/vertexai/vertexai.py +1055 -0
  73. datahub/ingestion/source/vertexai/vertexai_config.py +29 -0
  74. datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +68 -0
  75. datahub/metadata/_schema_classes.py +472 -1
  76. datahub/metadata/com/linkedin/pegasus2avro/dataplatform/slack/__init__.py +15 -0
  77. datahub/metadata/com/linkedin/pegasus2avro/event/__init__.py +11 -0
  78. datahub/metadata/com/linkedin/pegasus2avro/event/notification/__init__.py +15 -0
  79. datahub/metadata/com/linkedin/pegasus2avro/event/notification/settings/__init__.py +19 -0
  80. datahub/metadata/schema.avsc +313 -2
  81. datahub/metadata/schemas/CorpUserEditableInfo.avsc +14 -0
  82. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  83. datahub/metadata/schemas/CorpUserSettings.avsc +95 -0
  84. datahub/metadata/schemas/DataProcessInstanceInput.avsc +2 -1
  85. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
  86. datahub/metadata/schemas/Deprecation.avsc +2 -0
  87. datahub/metadata/schemas/MLModelGroupProperties.avsc +16 -0
  88. datahub/metadata/schemas/MetadataChangeEvent.avsc +32 -0
  89. datahub/metadata/schemas/QueryProperties.avsc +20 -0
  90. datahub/metadata/schemas/Siblings.avsc +2 -0
  91. datahub/metadata/schemas/SlackUserInfo.avsc +160 -0
  92. datahub/sdk/__init__.py +1 -0
  93. datahub/sdk/dataset.py +122 -0
  94. datahub/sdk/entity.py +99 -3
  95. datahub/sdk/entity_client.py +27 -3
  96. datahub/sdk/main_client.py +24 -1
  97. datahub/sdk/search_client.py +81 -8
  98. datahub/sdk/search_filters.py +94 -37
  99. datahub/sql_parsing/split_statements.py +17 -3
  100. datahub/sql_parsing/sql_parsing_aggregator.py +6 -0
  101. datahub/sql_parsing/tool_meta_extractor.py +27 -2
  102. datahub/testing/mcp_diff.py +1 -18
  103. datahub/utilities/threaded_iterator_executor.py +16 -3
  104. datahub/ingestion/source/vertexai.py +0 -697
  105. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info/licenses}/LICENSE +0 -0
  106. {acryl_datahub-1.0.0rc18.dist-info → acryl_datahub-1.0.0.1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/dremio/dremio_source.py
@@ -1,7 +1,6 @@
 import logging
-import re
-from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
 from typing import Dict, Iterable, List, Optional
 
 from datahub.emitter.mce_builder import (
@@ -28,7 +27,10 @@ from datahub.ingestion.source.dremio.dremio_api import (
     DremioEdition,
 )
 from datahub.ingestion.source.dremio.dremio_aspects import DremioAspects
-from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
+from datahub.ingestion.source.dremio.dremio_config import (
+    DremioSourceConfig,
+    DremioSourceMapping,
+)
 from datahub.ingestion.source.dremio.dremio_datahub_source_mapping import (
     DremioToDataHubSourceTypeMapping,
 )
@@ -39,6 +41,7 @@ from datahub.ingestion.source.dremio.dremio_entities import (
     DremioDatasetType,
     DremioGlossaryTerm,
     DremioQuery,
+    DremioSourceContainer,
 )
 from datahub.ingestion.source.dremio.dremio_profiling import DremioProfiler
 from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
@@ -65,6 +68,17 @@ from datahub.sql_parsing.sql_parsing_aggregator import (
 logger = logging.getLogger(__name__)
 
 
+@dataclass
+class DremioSourceMapEntry:
+    platform: str
+    source_name: str
+    dremio_source_category: str
+    root_path: str = ""
+    database_name: str = ""
+    platform_instance: Optional[str] = None
+    env: Optional[str] = None
+
+
 @platform_name("Dremio")
 @config_class(DremioSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
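
Note: DremioSourceMapEntry (added above) defaults root_path/database_name to empty strings and platform_instance/env to None, so entries for unmapped sources only need the first three fields. A minimal illustration; all values are made up:

    from datahub.ingestion.source.dremio.dremio_source import DremioSourceMapEntry

    entry = DremioSourceMapEntry(
        platform="postgres",
        source_name="pg_finance",
        dremio_source_category="database",
    )
    # Defaults kick in for the remaining fields.
    print(entry.root_path == "", entry.platform_instance is None, entry.env is None)
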
@@ -112,7 +126,7 @@ class DremioSource(StatefulIngestionSourceBase):
         self.default_db = "dremio"
         self.config = config
         self.report = DremioSourceReport()
-        self.source_map: Dict[str, Dict] = defaultdict()
+        self.source_map: Dict[str, DremioSourceMapEntry] = dict()
 
         # Initialize API operations
         dremio_api = DremioAPIOperations(self.config, self.report)
@@ -152,111 +166,12 @@ class DremioSource(StatefulIngestionSourceBase):
     def get_platform(self) -> str:
         return "dremio"
 
-    def _build_source_map(self) -> Dict[str, Dict]:
-        """
-        Builds a source mapping dictionary to support external lineage generation across
-        multiple Dremio sources, based on provided configuration mappings.
-
-        This method operates as follows:
-
-        1. If a source mapping is present in the config:
-           - For each source in the Dremio catalog, if the mapping's `source_name` matches
-             the `dremio_source_type`, `root_path` and `database_name` are added to the mapping
-             information, along with the platform, platform instance, and environment if they exist.
-             This allows constructing the full URN for upstream lineage.
-
-        2. If a source mapping is absent in the configuration:
-           - Default mappings are created for each source name, setting `env` and `platform_instance`
-             to default values and classifying the source type. This ensures all sources have a
-             mapping, even if specific configuration details are missing.
-
-        Returns:
-            Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
-            (lowercased) and each value is another dictionary containing:
-                - `platform`: The source platform.
-                - `source_name`: The source name.
-                - `dremio_source_type`: The type mapped to DataHub,
-                  e.g., "database", "folder".
-                - Optional `root_path`, `database_name`, `platform_instance`,
-                  and `env` if provided in the configuration.
-        Example:
-            This method is used internally within the class to generate mappings before
-            creating cross-platform lineage.
-
-        """
-
-        source_map = {}
+    def _build_source_map(self) -> Dict[str, DremioSourceMapEntry]:
         dremio_sources = self.dremio_catalog.get_sources()
+        source_mappings_config = self.config.source_mappings or []
 
-        for source in dremio_sources:
-            source_name = source.container_name
-            if isinstance(source.dremio_source_type, str):
-                source_type = source.dremio_source_type.lower()
-                root_path = source.root_path.lower() if source.root_path else ""
-                database_name = (
-                    source.database_name.lower() if source.database_name else ""
-                )
-                source_present = False
-                source_platform_name = source_name
-
-                for mapping in self.config.source_mappings or []:
-                    if re.search(mapping.source_name, source_type, re.IGNORECASE):
-                        source_platform_name = mapping.source_name.lower()
-
-                    datahub_source_type = (
-                        DremioToDataHubSourceTypeMapping.get_datahub_source_type(
-                            source_type
-                        )
-                    )
-
-                    if re.search(mapping.platform, datahub_source_type, re.IGNORECASE):
-                        source_platform_name = source_platform_name.lower()
-                        source_map[source_platform_name] = {
-                            "platform": mapping.platform,
-                            "source_name": mapping.source_name,
-                            "dremio_source_type": DremioToDataHubSourceTypeMapping.get_category(
-                                source_type,
-                            ),
-                            "root_path": root_path,
-                            "database_name": database_name,
-                            "platform_instance": mapping.platform_instance,
-                            "env": mapping.env,
-                        }
-                        source_present = True
-                        break
-
-                if not source_present:
-                    try:
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-                    except Exception as exc:
-                        logger.info(
-                            f"Source {source_type} is not a standard Dremio source type. "
-                            f"Adding source_type {source_type} to mapping as database. Error: {exc}"
-                        )
-
-                        DremioToDataHubSourceTypeMapping.add_mapping(
-                            source_type, source_name
-                        )
-                        dremio_source_type = (
-                            DremioToDataHubSourceTypeMapping.get_category(source_type)
-                        )
-
-                    source_map[source_platform_name.lower()] = {
-                        "platform": source_type,
-                        "source_name": source_name,
-                        "dremio_source_type": dremio_source_type,
-                    }
-
-            else:
-                logger.error(
-                    f'Source "{source.container_name}" is broken. Containers will not be created for source.'
-                )
-                logger.error(
-                    f'No new cross-platform lineage will be emitted for source "{source.container_name}".'
-                )
-                logger.error("Fix this source in Dremio to fix this issue.")
+        source_map = build_dremio_source_map(dremio_sources, source_mappings_config)
+        logger.info(f"Full source map: {source_map}")
 
         return source_map
 
@@ -431,6 +346,7 @@ class DremioSource(StatefulIngestionSourceBase):
                 dremio_path=dataset_info.path,
                 dremio_dataset=dataset_info.resource_name,
             )
+            logger.debug(f"Upstream dataset for {dataset_urn}: {upstream_urn}")
 
             if upstream_urn:
                 upstream_lineage = UpstreamLineage(
@@ -596,25 +512,23 @@ class DremioSource(StatefulIngestionSourceBase):
        if not mapping:
            return None
 
-        platform = mapping.get("platform")
+        platform = mapping.platform
        if not platform:
            return None
 
-        platform_instance = mapping.get(
-            "platform_instance", self.config.platform_instance
-        )
-        env = mapping.get("env", self.config.env)
+        platform_instance = mapping.platform_instance
+        env = mapping.env or self.config.env
 
        root_path = ""
        database_name = ""
 
-        if mapping.get("dremio_source_type") == "file_object_storage":
-            if mapping.get("root_path"):
-                root_path = f"{mapping['root_path'][1:]}/"
+        if mapping.dremio_source_category == "file_object_storage":
+            if mapping.root_path:
+                root_path = f"{mapping.root_path[1:]}/"
            dremio_dataset = f"{root_path}{'/'.join(dremio_path[1:])}/{dremio_dataset}"
        else:
-            if mapping.get("database_name"):
-                database_name = f"{mapping['database_name']}."
+            if mapping.database_name:
+                database_name = f"{mapping.database_name}."
            dremio_dataset = (
                f"{database_name}{'.'.join(dremio_path[1:])}.{dremio_dataset}"
            )
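
Note: a worked example of the two path-assembly branches above; all values are made up. The file_object_storage branch strips the leading character of root_path and joins path segments with "/", while the database branch prefixes the database name and joins with ".":

    dremio_path = ["my_s3_source", "landing", "events"]  # hypothetical Dremio path
    dremio_dataset = "daily"

    # file_object_storage branch, with mapping.root_path = "/landing-bucket"
    root_path = "/landing-bucket"[1:] + "/"
    print(f"{root_path}{'/'.join(dremio_path[1:])}/{dremio_dataset}")
    # -> landing-bucket/landing/events/daily

    # database branch, with mapping.database_name = "analytics"
    database_name = "analytics" + "."
    print(f"{database_name}{'.'.join(dremio_path[1:])}.{dremio_dataset}")
    # -> analytics.landing.events.daily
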
@@ -639,3 +553,68 @@ class DremioSource(StatefulIngestionSourceBase):
         Get the source report.
         """
         return self.report
+
+
+def build_dremio_source_map(
+    dremio_sources: Iterable[DremioSourceContainer],
+    source_mappings_config: List[DremioSourceMapping],
+) -> Dict[str, DremioSourceMapEntry]:
+    """
+    Builds a source mapping dictionary to support external lineage generation across
+    multiple Dremio sources, based on provided configuration mappings.
+
+    This method operates as follows:
+
+    Returns:
+        Dict[str, Dict]: A dictionary (`source_map`) where each key is a source name
+        (lowercased) and each value is another entry containing:
+            - `platform`: The source platform.
+            - `source_name`: The source name.
+            - `dremio_source_category`: The type mapped to DataHub,
+              e.g., "database", "folder".
+            - Optional `root_path`, `database_name`, `platform_instance`,
+              and `env` if provided in the configuration.
+    Example:
+        This method is used internally within the class to generate mappings before
+        creating cross-platform lineage.
+
+    """
+    source_map = {}
+    for source in dremio_sources:
+        current_source_name = source.container_name
+
+        source_type = source.dremio_source_type.lower()
+        source_category = DremioToDataHubSourceTypeMapping.get_category(source_type)
+        datahub_platform = DremioToDataHubSourceTypeMapping.get_datahub_platform(
+            source_type
+        )
+        root_path = source.root_path.lower() if source.root_path else ""
+        database_name = source.database_name.lower() if source.database_name else ""
+        source_present = False
+
+        for mapping in source_mappings_config:
+            if mapping.source_name.lower() == current_source_name.lower():
+                source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                    platform=mapping.platform,
+                    source_name=mapping.source_name,
+                    dremio_source_category=source_category,
+                    root_path=root_path,
+                    database_name=database_name,
+                    platform_instance=mapping.platform_instance,
+                    env=mapping.env,
+                )
+                source_present = True
+                break
+
+        if not source_present:
+            source_map[current_source_name.lower()] = DremioSourceMapEntry(
+                platform=datahub_platform,
+                source_name=current_source_name,
+                dremio_source_category=source_category,
+                root_path=root_path,
+                database_name=database_name,
+                platform_instance=None,
+                env=None,
+            )
+
+    return source_map
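
Note: a rough usage sketch of the new module-level build_dremio_source_map helper. The stand-in source object only carries the attributes the helper reads, the "S3" source type is assumed to be one that DremioToDataHubSourceTypeMapping recognizes, and the DremioSourceMapping keyword arguments are inferred from the fields referenced in this diff:

    from types import SimpleNamespace

    from datahub.ingestion.source.dremio.dremio_config import DremioSourceMapping
    from datahub.ingestion.source.dremio.dremio_source import build_dremio_source_map

    # Duck-typed stand-in for a DremioSourceContainer (hypothetical values).
    source = SimpleNamespace(
        container_name="my_s3_source",
        dremio_source_type="S3",
        root_path="/landing-bucket",
        database_name=None,
    )
    mapping = DremioSourceMapping(platform="s3", source_name="my_s3_source", env="PROD")

    source_map = build_dremio_source_map([source], [mapping])
    entry = source_map["my_s3_source"]
    print(entry.platform, entry.dremio_source_category, entry.root_path)
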
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -12,32 +12,14 @@ from datahub.configuration import ConfigModel
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.source import SourceReport
 from datahub.ingestion.graph.client import DataHubGraph
-from datahub.ingestion.graph.filters import RemovedStatusFilter
+from datahub.ingestion.graph.filters import RemovedStatusFilter, SearchFilterRule
 from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns._urn_base import Urn
+from datahub.utilities.urns.error import InvalidUrnError
 
 logger = logging.getLogger(__name__)
 
-QUERY_ENTITIES = """
-query listEntities($input: ScrollAcrossEntitiesInput!) {
-    scrollAcrossEntities(input: $input) {
-        nextScrollId
-        count
-        searchResults {
-            entity {
-                ... on QueryEntity {
-                    urn
-                }
-                ... on DataProcessInstance {
-                    urn
-                }
-            }
-        }
-    }
-}
-"""
-
 
 class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     enabled: bool = Field(
@@ -64,7 +46,33 @@ class SoftDeletedEntitiesCleanupConfig(ConfigModel):
     )
 
     entity_types: Optional[List[str]] = Field(
-        default=None,
+        # A default value is required otherwise QUERY and DATAPROCESS_INSTANCE won't be included
+        default=[
+            "dataset",
+            "dashboard",
+            "chart",
+            "mlmodel",
+            "mlmodelGroup",
+            "mlfeatureTable",
+            "mlfeature",
+            "mlprimaryKey",
+            "dataFlow",
+            "dataJob",
+            "glossaryTerm",
+            "glossaryNode",
+            "tag",
+            "role",
+            "corpuser",
+            "corpGroup",
+            "container",
+            "domain",
+            "dataProduct",
+            "notebook",
+            "businessAttribute",
+            "schemaField",
+            "query",
+            "dataProcessInstance",
+        ],
         description="List of entity types to cleanup",
     )
 
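
Note: because overriding entity_types replaces the default list above, a cleanup config that narrows the scope must re-list "query" and "dataProcessInstance" explicitly if those entities should still be removed. A minimal sketch, assuming the remaining config fields keep their defaults:

    from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
        SoftDeletedEntitiesCleanupConfig,
    )

    # Hypothetical override: clean up only datasets plus the two entity types
    # that the new default list was added for.
    config = SoftDeletedEntitiesCleanupConfig(
        entity_types=["dataset", "query", "dataProcessInstance"],
    )
    print(config.entity_types)
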
@@ -103,6 +111,9 @@ class SoftDeletedEntitiesReport(SourceReport):
     num_entities_found: Dict[str, int] = field(default_factory=dict)
     num_soft_deleted_entity_processed: int = 0
     num_soft_deleted_retained_due_to_age: int = 0
+    num_soft_deleted_retained_due_to_age_by_type: TopKDict[str, int] = field(
+        default_factory=TopKDict
+    )
     num_soft_deleted_entity_removal_started: int = 0
     num_hard_deleted: int = 0
     num_hard_deleted_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
@@ -111,6 +122,8 @@ class SoftDeletedEntitiesReport(SourceReport):
     )
     runtime_limit_reached: bool = False
     deletion_limit_reached: bool = False
+    num_soft_deleted_entity_found: int = 0
+    num_soft_deleted_entity_invalid_urn: int = 0
 
 
 class SoftDeletedEntitiesCleanup:
@@ -133,7 +146,7 @@ class SoftDeletedEntitiesCleanup:
         self.config = config
         self.report = report
         self.dry_run = dry_run
-        self.start_time = 0.0
+        self.start_time = time.time()
         self._report_lock: Lock = Lock()
         self.last_print_time = 0.0
 
@@ -142,6 +155,14 @@ class SoftDeletedEntitiesCleanup:
         with self._report_lock:
             self.report.num_soft_deleted_retained_due_to_age += 1
 
+    def _increment_retained_by_type(self, type: str) -> None:
+        """Thread-safe method to update report fields"""
+        with self._report_lock:
+            self.report.num_soft_deleted_retained_due_to_age_by_type[type] = (
+                self.report.num_soft_deleted_retained_due_to_age_by_type.get(type, 0)
+                + 1
+            )
+
     def _increment_removal_started_count(self) -> None:
         """Thread-safe method to update report fields"""
         with self._report_lock:
@@ -160,10 +181,9 @@ class SoftDeletedEntitiesCleanup:
                 )
             self.report.sample_hard_deleted_aspects_by_type[entity_type].append(urn)
 
-    def delete_entity(self, urn: str) -> None:
+    def delete_entity(self, urn: Urn) -> None:
         assert self.ctx.graph
 
-        entity_urn = Urn.from_string(urn)
         if self.dry_run:
             logger.info(
                 f"Dry run is on otherwise it would have deleted {urn} with hard deletion"
@@ -172,14 +192,14 @@ class SoftDeletedEntitiesCleanup:
         if self._deletion_limit_reached() or self._times_up():
             return
         self._increment_removal_started_count()
-        self.ctx.graph.delete_entity(urn=urn, hard=True)
+        self.ctx.graph.delete_entity(urn=urn.urn(), hard=True)
         self.ctx.graph.delete_references_to_urn(
-            urn=urn,
+            urn=urn.urn(),
             dry_run=False,
         )
-        self._update_report(urn, entity_urn.entity_type)
+        self._update_report(urn.urn(), urn.entity_type)
 
-    def delete_soft_deleted_entity(self, urn: str) -> None:
+    def delete_soft_deleted_entity(self, urn: Urn) -> None:
         assert self.ctx.graph
 
         retention_time = (
@@ -187,7 +207,7 @@ class SoftDeletedEntitiesCleanup:
             - self.config.retention_days * 24 * 60 * 60
         )
 
-        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn, aspects=["status"])
+        aspect = self.ctx.graph.get_entity_raw(entity_urn=urn.urn(), aspects=["status"])
         if "status" in aspect["aspects"]:
             if aspect["aspects"]["status"]["value"]["removed"] and aspect["aspects"][
                 "status"
@@ -196,6 +216,7 @@ class SoftDeletedEntitiesCleanup:
                 self.delete_entity(urn)
             else:
                 self._increment_retained_count()
+                self._increment_retained_by_type(urn.entity_type)
 
     def _print_report(self) -> None:
         time_taken = round(time.time() - self.last_print_time, 1)
@@ -204,7 +225,7 @@ class SoftDeletedEntitiesCleanup:
             self.last_print_time = time.time()
             logger.info(f"\n{self.report.as_string()}")
 
-    def _process_futures(self, futures: Dict[Future, str]) -> Dict[Future, str]:
+    def _process_futures(self, futures: Dict[Future, Urn]) -> Dict[Future, Urn]:
         done, not_done = wait(futures, return_when=FIRST_COMPLETED)
         futures = {future: urn for future, urn in futures.items() if future in not_done}
 
@@ -214,7 +235,7 @@ class SoftDeletedEntitiesCleanup:
                 self.report.failure(
                     title="Failed to delete entity",
                     message="Failed to delete entity",
-                    context=futures[future],
+                    context=futures[future].urn(),
                     exc=future.exception(),
                 )
             self.report.num_soft_deleted_entity_processed += 1
@@ -229,86 +250,52 @@ class SoftDeletedEntitiesCleanup:
             time.sleep(self.config.delay)
         return futures
 
-    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
+    def _get_urns(self) -> Iterable[str]:
         assert self.ctx.graph
-        scroll_id: Optional[str] = None
-
-        batch_size = self.config.batch_size
-        if entity_type == "DATA_PROCESS_INSTANCE":
-            # Due to a bug in Data process instance querying this is a temp workaround
-            # to avoid a giant stacktrace by having a smaller batch size in first call
-            # This will be remove in future version after server with fix has been
-            # around for a while
-            batch_size = 10
-
-        while True:
-            try:
-                if entity_type not in self.report.num_calls_made:
-                    self.report.num_calls_made[entity_type] = 1
-                else:
-                    self.report.num_calls_made[entity_type] += 1
-                self._print_report()
-                result = self.ctx.graph.execute_graphql(
-                    graphql_query,
-                    {
-                        "input": {
-                            "types": [entity_type],
-                            "query": "*",
-                            "scrollId": scroll_id if scroll_id else None,
-                            "count": batch_size,
-                            "orFilters": [
-                                {
-                                    "and": [
-                                        {
-                                            "field": "removed",
-                                            "values": ["true"],
-                                            "condition": "EQUAL",
-                                        }
-                                    ]
-                                }
-                            ],
-                        }
-                    },
-                )
-            except Exception as e:
-                self.report.failure(
-                    f"While trying to get {entity_type} with {scroll_id}", exc=e
-                )
-                break
-            scroll_across_entities = result.get("scrollAcrossEntities")
-            if not scroll_across_entities:
-                break
-            search_results = scroll_across_entities.get("searchResults")
-            count = scroll_across_entities.get("count")
-            if not count or not search_results:
-                # Due to a server bug we cannot rely on just count as it was returning response like this
-                # {'count': 1, 'nextScrollId': None, 'searchResults': []}
-                break
-            if entity_type == "DATA_PROCESS_INSTANCE":
-                # Temp workaround. See note in beginning of the function
-                # We make the batch size = config after call has succeeded once
-                batch_size = self.config.batch_size
-            scroll_id = scroll_across_entities.get("nextScrollId")
-            if entity_type not in self.report.num_entities_found:
-                self.report.num_entities_found[entity_type] = 0
-            self.report.num_entities_found[entity_type] += scroll_across_entities.get(
-                "count"
+        # Entities created in the retention period are not considered for deletion
+        created_from = int(
+            (
+                datetime.now(timezone.utc).timestamp()
+                - self.config.retention_days * 24 * 60 * 60
            )
-            for query in search_results:
-                yield query["entity"]["urn"]
+            * 1000
+        )
+
+        entity_types = self.config.entity_types
+        # dataProcessInstance is a special case where we need to get the entities separately
+        # because we need to filter based on created time we don't stream to many dataProcessInstance entities at once
+        # Gc source soft-deletes dataProcessInstance entities which causes to have a lot of soft deleted entities
+        if (
+            self.config.entity_types
+            and "dataProcessInstance" in self.config.entity_types
+        ):
+            entity_types = self.config.entity_types.copy()
+            yield from self.ctx.graph.get_urns_by_filter(
+                entity_types=["dataProcessInstance"],
+                platform=self.config.platform,
+                env=self.config.env,
+                query=self.config.query,
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=self.config.batch_size,
+                extraFilters=[
+                    SearchFilterRule(
+                        field="created",
+                        condition="LESS_THAN",
+                        values=[f"{created_from}"],
+                    ).to_raw()
+                ],
+            )
+
+            entity_types.remove("dataProcessInstance")
 
-    def _get_urns(self) -> Iterable[str]:
-        assert self.ctx.graph
         yield from self.ctx.graph.get_urns_by_filter(
-            entity_types=self.config.entity_types,
+            entity_types=entity_types,
            platform=self.config.platform,
            env=self.config.env,
            query=self.config.query,
            status=RemovedStatusFilter.ONLY_SOFT_DELETED,
            batch_size=self.config.batch_size,
        )
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
-        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
 
     def _times_up(self) -> bool:
         if (
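
Note: the created-time cutoff above is an epoch-milliseconds value compared against the created field with a LESS_THAN condition. A standalone sketch of the filter construction; the constructor arguments mirror the diff, but the exact dict returned by to_raw() is an assumption:

    from datetime import datetime, timezone

    from datahub.ingestion.graph.filters import SearchFilterRule

    retention_days = 10  # illustrative value
    created_from = int(
        (datetime.now(timezone.utc).timestamp() - retention_days * 24 * 60 * 60) * 1000
    )

    rule = SearchFilterRule(
        field="created",
        condition="LESS_THAN",
        values=[str(created_from)],
    )
    # to_raw() presumably yields the plain dict form expected by the
    # extraFilters parameter of get_urns_by_filter.
    print(rule.to_raw())
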
@@ -335,16 +322,26 @@ class SoftDeletedEntitiesCleanup:
             return
         self.start_time = time.time()
 
-        futures: Dict[Future, str] = dict()
+        futures: Dict[Future, Urn] = dict()
         with ThreadPoolExecutor(max_workers=self.config.max_workers) as executor:
             for urn in self._get_urns():
+                try:
+                    self.report.num_soft_deleted_entity_found += 1
+                    soft_deleted_urn = Urn.from_string(urn)
+                except InvalidUrnError as e:
+                    logger.error(f"Failed to parse urn {urn} with error {e}")
+                    self.report.num_soft_deleted_entity_invalid_urn += 1
+                    continue
+
                 self._print_report()
                 while len(futures) >= self.config.futures_max_at_time:
                     futures = self._process_futures(futures)
                 if self._deletion_limit_reached() or self._times_up():
                     break
-                future = executor.submit(self.delete_soft_deleted_entity, urn)
-                futures[future] = urn
+                future = executor.submit(
+                    self.delete_soft_deleted_entity, soft_deleted_urn
+                )
+                futures[future] = soft_deleted_urn
 
             logger.info(f"Waiting for {len(futures)} futures to complete")
             while len(futures) > 0:
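
Note: a minimal sketch of the parse-and-skip guard added above, assuming Urn.from_string raises InvalidUrnError for malformed strings; the urn values are made up:

    from datahub.utilities.urns._urn_base import Urn
    from datahub.utilities.urns.error import InvalidUrnError

    candidates = [
        "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)",
        "not-a-valid-urn",
    ]

    for raw in candidates:
        try:
            parsed = Urn.from_string(raw)
        except InvalidUrnError as e:
            print(f"skipping invalid urn {raw}: {e}")
            continue
        print(parsed.entity_type, parsed.urn())
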
datahub/ingestion/source/ge_data_profiler.py
@@ -602,7 +602,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
         if not self.config.include_field_median_value:
             return
         try:
-            if self.dataset.engine.dialect.name.lower() in [SNOWFLAKE, DATABRICKS]:
+            if self.dataset.engine.dialect.name.lower() == SNOWFLAKE:
                 column_profile.median = str(
                     self.dataset.engine.execute(
                         sa.select([sa.func.median(sa.column(column))]).select_from(
@@ -610,6 +610,16 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase):
                         )
                     ).scalar()
                 )
+            elif self.dataset.engine.dialect.name.lower() == DATABRICKS:
+                column_profile.median = str(
+                    self.dataset.engine.execute(
+                        sa.select(
+                            sa.text(
+                                f"approx_percentile(`{column}`, 0.5) as approx_median"
+                            )
+                        ).select_from(self.dataset._table)
+                    ).scalar()
+                )
             elif self.dataset.engine.dialect.name.lower() == BIGQUERY:
                 column_profile.median = str(
                     self.dataset.engine.execute(
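
Note: a standalone sketch of the statement the new Databricks branch builds; the table and column names are placeholders, and in the profiler the statement is executed via self.dataset.engine.execute(...).scalar() against a live engine rather than printed:

    import sqlalchemy as sa

    column = "amount"  # placeholder column name
    stmt = sa.select(
        sa.text(f"approx_percentile(`{column}`, 0.5) as approx_median")
    ).select_from(sa.table("my_table"))

    # Roughly: SELECT approx_percentile(`amount`, 0.5) as approx_median FROM my_table
    print(stmt)
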