acryl-datahub 0.15.0rc24__py3-none-any.whl → 0.15.0.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of acryl-datahub may be problematic.
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/METADATA +2408 -2412
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/RECORD +116 -106
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/WHEEL +1 -1
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/entry_points.txt +1 -1
- datahub/__init__.py +1 -1
- datahub/api/circuit_breaker/assertion_circuit_breaker.py +5 -4
- datahub/api/entities/structuredproperties/structuredproperties.py +20 -8
- datahub/configuration/common.py +2 -5
- datahub/configuration/source_common.py +13 -0
- datahub/emitter/mce_builder.py +20 -4
- datahub/emitter/mcp_builder.py +2 -7
- datahub/emitter/mcp_patch_builder.py +37 -13
- datahub/emitter/rest_emitter.py +25 -3
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +98 -0
- datahub/ingestion/api/closeable.py +3 -3
- datahub/ingestion/api/ingestion_job_checkpointing_provider_base.py +4 -7
- datahub/ingestion/api/report.py +4 -1
- datahub/ingestion/api/sink.py +4 -3
- datahub/ingestion/api/source.py +4 -0
- datahub/ingestion/api/source_helpers.py +2 -6
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +44 -1
- datahub/ingestion/source/aws/aws_common.py +231 -27
- datahub/ingestion/source/aws/glue.py +12 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +10 -18
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +3 -9
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +5 -20
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +11 -17
- datahub/ingestion/source/bigquery_v2/lineage.py +9 -22
- datahub/ingestion/source/datahub/config.py +22 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
- datahub/ingestion/source/datahub/datahub_kafka_reader.py +2 -1
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -3
- datahub/ingestion/source/gc/datahub_gc.py +21 -5
- datahub/ingestion/source/gc/dataprocess_cleanup.py +23 -10
- datahub/ingestion/source/gc/execution_request_cleanup.py +61 -16
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +178 -83
- datahub/ingestion/source/iceberg/iceberg.py +27 -1
- datahub/ingestion/source/iceberg/iceberg_common.py +4 -0
- datahub/ingestion/source/kafka_connect/__init__.py +0 -0
- datahub/ingestion/source/kafka_connect/common.py +202 -0
- datahub/ingestion/source/kafka_connect/kafka_connect.py +367 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +341 -0
- datahub/ingestion/source/kafka_connect/source_connectors.py +570 -0
- datahub/ingestion/source/looker/looker_common.py +63 -2
- datahub/ingestion/source/looker/looker_dataclasses.py +7 -9
- datahub/ingestion/source/looker/looker_lib_wrapper.py +13 -1
- datahub/ingestion/source/looker/looker_source.py +31 -4
- datahub/ingestion/source/looker/looker_usage.py +23 -17
- datahub/ingestion/source/mlflow.py +30 -5
- datahub/ingestion/source/mode.py +40 -27
- datahub/ingestion/source/powerbi/config.py +1 -14
- datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +1 -1
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +1 -1
- datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +16 -2
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +16 -15
- datahub/ingestion/source/s3/source.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -34
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +14 -64
- datahub/ingestion/source/snowflake/snowflake_queries.py +44 -14
- datahub/ingestion/source/snowflake/snowflake_query.py +5 -10
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +53 -7
- datahub/ingestion/source/snowflake/snowflake_shares.py +1 -1
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +2 -5
- datahub/ingestion/source/snowflake/snowflake_utils.py +22 -18
- datahub/ingestion/source/snowflake/snowflake_v2.py +38 -34
- datahub/ingestion/source/sql/hive.py +621 -8
- datahub/ingestion/source/sql/hive_metastore.py +7 -0
- datahub/ingestion/source/sql/mssql/job_models.py +30 -1
- datahub/ingestion/source/sql/mssql/source.py +15 -1
- datahub/ingestion/source/sql/sql_common.py +41 -102
- datahub/ingestion/source/sql/sql_generic_profiler.py +5 -6
- datahub/ingestion/source/sql/sql_report.py +2 -0
- datahub/ingestion/source/state/checkpoint.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +122 -45
- datahub/ingestion/source/tableau/tableau_common.py +18 -0
- datahub/ingestion/source/tableau/tableau_constant.py +3 -1
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +6 -2
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/proxy.py +8 -27
- datahub/ingestion/source/usage/usage_common.py +15 -1
- datahub/ingestion/source_report/ingestion_stage.py +3 -0
- datahub/metadata/_schema_classes.py +256 -3
- datahub/metadata/_urns/urn_defs.py +168 -168
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/ml/metadata/__init__.py +2 -0
- datahub/metadata/schema.avsc +252 -33
- datahub/metadata/schemas/DataJobKey.avsc +2 -1
- datahub/metadata/schemas/DataProcessInstanceKey.avsc +5 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -1
- datahub/metadata/schemas/DataTransformLogic.avsc +63 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +82 -0
- datahub/metadata/schemas/MLModelProperties.avsc +62 -2
- datahub/metadata/schemas/MLTrainingRunProperties.avsc +171 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +94 -2
- datahub/specific/aspect_helpers/__init__.py +0 -0
- datahub/specific/aspect_helpers/custom_properties.py +79 -0
- datahub/specific/aspect_helpers/ownership.py +67 -0
- datahub/specific/aspect_helpers/structured_properties.py +72 -0
- datahub/specific/aspect_helpers/tags.py +42 -0
- datahub/specific/aspect_helpers/terms.py +43 -0
- datahub/specific/chart.py +28 -184
- datahub/specific/dashboard.py +31 -196
- datahub/specific/datajob.py +34 -189
- datahub/specific/dataproduct.py +24 -86
- datahub/specific/dataset.py +48 -133
- datahub/specific/form.py +12 -32
- datahub/specific/structured_property.py +9 -9
- datahub/sql_parsing/sql_parsing_aggregator.py +10 -9
- datahub/sql_parsing/sqlglot_lineage.py +15 -5
- datahub/sql_parsing/tool_meta_extractor.py +119 -5
- datahub/utilities/time.py +8 -3
- datahub/utilities/urns/_urn_base.py +5 -7
- datahub/ingestion/source/kafka/kafka_connect.py +0 -1468
- datahub/specific/custom_properties.py +0 -37
- datahub/specific/ownership.py +0 -48
- datahub/specific/structured_properties.py +0 -53
- {acryl_datahub-0.15.0rc24.dist-info → acryl_datahub-0.15.0.1.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/datahub/config.py
+++ b/datahub/ingestion/source/datahub/config.py
@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set
 
+import pydantic
 from pydantic import Field, root_validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -14,6 +15,17 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
 DEFAULT_DATABASE_TABLE_NAME = "metadata_aspect_v2"
 DEFAULT_KAFKA_TOPIC_NAME = "MetadataChangeLog_Timeseries_v1"
 DEFAULT_DATABASE_BATCH_SIZE = 10_000
+DEFAULT_EXCLUDE_ASPECTS = {
+    "dataHubIngestionSourceKey",
+    "dataHubIngestionSourceInfo",
+    "datahubIngestionRunSummary",
+    "datahubIngestionCheckpoint",
+    "dataHubSecretKey",
+    "dataHubSecretValue",
+    "globalSettingsKey",
+    "globalSettingsInfo",
+    "testResults",
+}
 
 
 class DataHubSourceConfig(StatefulIngestionConfigBase):
@@ -44,7 +56,7 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
     )
 
     exclude_aspects: Set[str] = Field(
-
+        default=DEFAULT_EXCLUDE_ASPECTS,
         description="Set of aspect names to exclude from ingestion",
     )
 
@@ -108,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
                 " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
             )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v
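Taken together, these config.py changes mean the DataHub source now skips DataHub-internal aspects by default and rejects bare `mysql` schemes for `database_connection`. A minimal standalone sketch of the scheme check, using a placeholder pydantic model instead of DataHub's `SQLAlchemyConnectionConfig`:

    import pydantic

    class ConnectionConfig(pydantic.BaseModel):
        # Stand-in for SQLAlchemyConnectionConfig; only the scheme field matters here.
        scheme: str

    def validate_mysql_scheme(v: ConnectionConfig) -> ConnectionConfig:
        # Mirrors the new validator: MySQL-flavoured schemes must be mysql+pymysql.
        if "mysql" in v.scheme and v.scheme != "mysql+pymysql":
            raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
        return v

    validate_mysql_scheme(ConnectionConfig(scheme="mysql+pymysql"))  # accepted
    # validate_mysql_scheme(ConnectionConfig(scheme="mysql"))        # raises ValueError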
--- a/datahub/ingestion/source/datahub/datahub_database_reader.py
+++ b/datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -151,8 +151,10 @@ class DataHubDatabaseReader:
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
@@ -160,22 +162,6 @@ class DataHubDatabaseReader:
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
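The reader now leans on SQLAlchemy's streaming execution options for PostgreSQL, MySQL, and MariaDB instead of a hand-rolled MySQLdb server-side cursor. A standalone sketch of that pattern, assuming a recent SQLAlchemy 1.4/2.0 and a placeholder connection URL and query:

    from sqlalchemy import create_engine, text

    # Placeholder URL; any PostgreSQL/MySQL/MariaDB engine works the same way.
    engine = create_engine("postgresql+psycopg2://user:pass@localhost/db")

    def stream_rows(query: str, batch_size: int = 10_000):
        with engine.connect() as conn:
            with conn.begin():  # PostgreSQL server-side cursors require a transaction
                conn = conn.execution_options(
                    stream_results=True,   # request a server-side cursor from the driver
                    yield_per=batch_size,  # buffer rows in batches of this size
                )
                for row in conn.execute(text(query)):
                    yield dict(row._mapping)

    # for row in stream_rows("SELECT urn, version FROM metadata_aspect_v2"):
    #     ...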
--- a/datahub/ingestion/source/datahub/datahub_kafka_reader.py
+++ b/datahub/ingestion/source/datahub/datahub_kafka_reader.py
@@ -12,6 +12,7 @@ from confluent_kafka.schema_registry import SchemaRegistryClient
 from confluent_kafka.schema_registry.avro import AvroDeserializer
 
 from datahub.configuration.kafka import KafkaConsumerConnectionConfig
+from datahub.emitter.mce_builder import parse_ts_millis
 from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.source.datahub.config import DataHubSourceConfig
@@ -92,7 +93,7 @@ class DataHubKafkaReader(Closeable):
                 if mcl.created and mcl.created.time > stop_time.timestamp() * 1000:
                     logger.info(
                         f"Stopped reading from kafka, reached MCL "
-                        f"with audit stamp {
+                        f"with audit stamp {parse_ts_millis(mcl.created.time)}"
                     )
                     break
 
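The only functional change here is that the log line now formats the MCL audit stamp through `parse_ts_millis` instead of interpolating the raw value. Presumably that helper converts an epoch-milliseconds timestamp into a datetime; a rough standalone equivalent (not the packaged implementation):

    from datetime import datetime, timezone
    from typing import Optional

    def parse_ts_millis(ts: Optional[float]) -> Optional[datetime]:
        # Convert an epoch timestamp in milliseconds to an aware UTC datetime.
        if ts is None:
            return None
        return datetime.fromtimestamp(ts / 1000, tz=timezone.utc)

    print(parse_ts_millis(1_700_000_000_000))  # 2023-11-14 22:13:20+00:00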
--- a/datahub/ingestion/source/datahub/datahub_source.py
+++ b/datahub/ingestion/source/datahub/datahub_source.py
@@ -130,7 +130,7 @@ class DataHubSource(StatefulIngestionSourceBase):
             self._commit_progress(i)
 
     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return
--- a/datahub/ingestion/source/dbt/dbt_cloud.py
+++ b/datahub/ingestion/source/dbt/dbt_cloud.py
@@ -1,7 +1,7 @@
 import logging
 from datetime import datetime
 from json import JSONDecodeError
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Literal, Optional, Tuple
 from urllib.parse import urlparse
 
 import dateutil.parser
@@ -62,6 +62,11 @@ class DBTCloudConfig(DBTCommonConfig):
         description="The ID of the run to ingest metadata from. If not specified, we'll default to the latest run.",
     )
 
+    external_url_mode: Literal["explore", "ide"] = Field(
+        default="explore",
+        description='Where should the "View in dbt" link point to - either the "Explore" UI or the dbt Cloud IDE',
+    )
+
     @root_validator(pre=True)
     def set_metadata_endpoint(cls, values: dict) -> dict:
         if values.get("access_url") and not values.get("metadata_endpoint"):
@@ -527,5 +532,7 @@ class DBTCloudSource(DBTSourceBase, TestableSource):
         )
 
     def get_external_url(self, node: DBTNode) -> Optional[str]:
-
-
+        if self.config.external_url_mode == "explore":
+            return f"{self.config.access_url}/explore/{self.config.account_id}/projects/{self.config.project_id}/environments/production/details/{node.dbt_name}"
+        else:
+            return f"{self.config.access_url}/develop/{self.config.account_id}/projects/{self.config.project_id}"
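The new `external_url_mode` option controls where the "View in dbt" link points. With placeholder account and project IDs, the two URL shapes produced by the updated `get_external_url` look like this:

    # Placeholder values; real ones come from the dbt Cloud config and node.
    access_url = "https://cloud.getdbt.com"
    account_id, project_id = 1234, 5678
    node_dbt_name = "model.my_project.my_model"

    explore_url = (  # external_url_mode: "explore" (the default)
        f"{access_url}/explore/{account_id}/projects/{project_id}"
        f"/environments/production/details/{node_dbt_name}"
    )
    ide_url = f"{access_url}/develop/{account_id}/projects/{project_id}"  # "ide"

    print(explore_url)
    print(ide_url)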
--- a/datahub/ingestion/source/gc/datahub_gc.py
+++ b/datahub/ingestion/source/gc/datahub_gc.py
@@ -34,6 +34,7 @@ from datahub.ingestion.source.gc.soft_deleted_entity_cleanup import (
     SoftDeletedEntitiesCleanupConfig,
     SoftDeletedEntitiesReport,
 )
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 
 logger = logging.getLogger(__name__)
 
@@ -86,6 +87,7 @@ class DataHubGcSourceReport(
     DataProcessCleanupReport,
     SoftDeletedEntitiesReport,
     DatahubExecutionRequestCleanupReport,
+    IngestionStageReport,
 ):
     expired_tokens_revoked: int = 0
 
@@ -139,31 +141,40 @@ class DataHubGcSource(Source):
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.cleanup_expired_tokens:
             try:
+                self.report.report_ingestion_stage_start("Expired Token Cleanup")
                 self.revoke_expired_tokens()
             except Exception as e:
                 self.report.failure("While trying to cleanup expired token ", exc=e)
         if self.config.truncate_indices:
             try:
+                self.report.report_ingestion_stage_start("Truncate Indices")
                 self.truncate_indices()
             except Exception as e:
                 self.report.failure("While trying to truncate indices ", exc=e)
         if self.config.soft_deleted_entities_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start(
+                    "Soft Deleted Entities Cleanup"
+                )
                 self.soft_deleted_entities_cleanup.cleanup_soft_deleted_entities()
             except Exception as e:
                 self.report.failure(
                     "While trying to cleanup soft deleted entities ", exc=e
                 )
-        if self.config.execution_request_cleanup.enabled:
-            try:
-                self.execution_request_cleanup.run()
-            except Exception as e:
-                self.report.failure("While trying to cleanup execution request ", exc=e)
         if self.config.dataprocess_cleanup.enabled:
             try:
+                self.report.report_ingestion_stage_start("Data Process Cleanup")
                 yield from self.dataprocess_cleanup.get_workunits_internal()
             except Exception as e:
                 self.report.failure("While trying to cleanup data process ", exc=e)
+        if self.config.execution_request_cleanup.enabled:
+            try:
+                self.report.report_ingestion_stage_start("Execution request Cleanup")
+                self.execution_request_cleanup.run()
+            except Exception as e:
+                self.report.failure("While trying to cleanup execution request ", exc=e)
+        # Otherwise last stage's duration does not get calculated.
+        self.report.report_ingestion_stage_start("End")
         yield from []
 
     def truncate_indices(self) -> None:
@@ -177,6 +188,9 @@ class DataHubGcSource(Source):
         self._truncate_timeseries_helper(
             aspect_name="dashboardUsageStatistics", entity_type="dashboard"
         )
+        self._truncate_timeseries_helper(
+            aspect_name="queryusagestatistics", entity_type="query"
+        )
 
     def _truncate_timeseries_helper(self, aspect_name: str, entity_type: str) -> None:
         self._truncate_timeseries_with_watch_optional(
@@ -281,6 +295,8 @@ class DataHubGcSource(Source):
             list_access_tokens = expired_tokens_res.get("listAccessTokens", {})
             tokens = list_access_tokens.get("tokens", [])
             total = list_access_tokens.get("total", 0)
+            if tokens == []:
+                break
             for token in tokens:
                 self.report.expired_tokens_revoked += 1
                 token_id = token["id"]
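Each GC sub-task now opens an ingestion stage via `report_ingestion_stage_start`, and a final "End" stage is started so the last real stage still gets a duration. A hedged sketch of that stage-timing idea (DataHub's actual `IngestionStageReport` may differ):

    import time
    from typing import Dict, Optional

    class StageTimer:
        """Toy stand-in for IngestionStageReport: starting a stage closes the previous one."""

        def __init__(self) -> None:
            self.current_stage: Optional[str] = None
            self.stage_started_at: Optional[float] = None
            self.durations: Dict[str, float] = {}

        def report_ingestion_stage_start(self, stage: str) -> None:
            now = time.monotonic()
            if self.current_stage is not None and self.stage_started_at is not None:
                self.durations[self.current_stage] = now - self.stage_started_at
            self.current_stage, self.stage_started_at = stage, now

    timer = StageTimer()
    timer.report_ingestion_stage_start("Expired Token Cleanup")
    # ... work ...
    timer.report_ingestion_stage_start("End")  # closes out the previous stage
    print(timer.durations)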
--- a/datahub/ingestion/source/gc/dataprocess_cleanup.py
+++ b/datahub/ingestion/source/gc/dataprocess_cleanup.py
@@ -167,9 +167,11 @@ class DataJobEntity:
 class DataProcessCleanupReport(SourceReport):
     num_aspects_removed: int = 0
     num_aspect_removed_by_type: TopKDict[str, int] = field(default_factory=TopKDict)
-
+    sample_soft_deleted_aspects_by_type: TopKDict[str, LossyList[str]] = field(
         default_factory=TopKDict
     )
+    num_data_flows_found: int = 0
+    num_data_jobs_found: int = 0
 
 
 class DataProcessCleanup:
@@ -265,13 +267,17 @@ class DataProcessCleanup:
                     self.report.report_failure(
                         f"Exception while deleting DPI: {e}", exc=e
                     )
-                if
+                if (
+                    deleted_count_last_n % self.config.batch_size == 0
+                    and deleted_count_last_n > 0
+                ):
                     logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
                     if self.config.delay:
                         logger.info(f"Sleeping for {self.config.delay} seconds")
                         time.sleep(self.config.delay)
 
-
+        if deleted_count_last_n > 0:
+            logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
 
     def delete_entity(self, urn: str, type: str) -> None:
         assert self.ctx.graph
@@ -280,9 +286,9 @@ class DataProcessCleanup:
         self.report.num_aspect_removed_by_type[type] = (
             self.report.num_aspect_removed_by_type.get(type, 0) + 1
         )
-        if type not in self.report.
-            self.report.
-        self.report.
+        if type not in self.report.sample_soft_deleted_aspects_by_type:
+            self.report.sample_soft_deleted_aspects_by_type[type] = LossyList()
+        self.report.sample_soft_deleted_aspects_by_type[type].append(urn)
 
         if self.dry_run:
             logger.info(
@@ -351,7 +357,10 @@ class DataProcessCleanup:
             except Exception as e:
                 self.report.report_failure(f"Exception while deleting DPI: {e}", exc=e)
 
-            if
+            if (
+                deleted_count_retention % self.config.batch_size == 0
+                and deleted_count_retention > 0
+            ):
                 logger.info(
                     f"Deleted {deleted_count_retention} DPIs from {job.urn} due to retention"
                 )
@@ -393,6 +402,7 @@ class DataProcessCleanup:
         scrollAcrossEntities = result.get("scrollAcrossEntities")
         if not scrollAcrossEntities:
             raise ValueError("Missing scrollAcrossEntities in response")
+        self.report.num_data_flows_found += scrollAcrossEntities.get("count")
         logger.info(f"Got {scrollAcrossEntities.get('count')} DataFlow entities")
 
         scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -415,8 +425,9 @@ class DataProcessCleanup:
         assert self.ctx.graph
 
         dataFlows: Dict[str, DataFlowEntity] = {}
-
-
+        if self.config.delete_empty_data_flows:
+            for flow in self.get_data_flows():
+                dataFlows[flow.urn] = flow
 
         scroll_id: Optional[str] = None
         previous_scroll_id: Optional[str] = None
@@ -443,6 +454,7 @@ class DataProcessCleanup:
         if not scrollAcrossEntities:
             raise ValueError("Missing scrollAcrossEntities in response")
 
+        self.report.num_data_jobs_found += scrollAcrossEntities.get("count")
         logger.info(f"Got {scrollAcrossEntities.get('count')} DataJob entities")
 
         scroll_id = scrollAcrossEntities.get("nextScrollId")
@@ -481,7 +493,8 @@ class DataProcessCleanup:
 
         previous_scroll_id = scroll_id
 
-
+        if deleted_jobs > 0:
+            logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
             deleted_data_flows: int = 0
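The cleanup loops now log progress only once per full batch, skip the "Deleted 0 ..." messages, and sleep for `delay` seconds between batches. A generic sketch of that throttled-progress pattern (function name, arguments, and defaults are placeholders, not DataHub APIs):

    import logging
    import time

    logger = logging.getLogger(__name__)

    def delete_in_batches(urns, delete_fn, batch_size=500, delay=1.0):
        deleted = 0
        for urn in urns:
            delete_fn(urn)
            deleted += 1
            # Log (and optionally back off) once per full batch, never on zero.
            if deleted % batch_size == 0 and deleted > 0:
                logger.info(f"Deleted {deleted} entities so far")
                if delay:
                    logger.info(f"Sleeping for {delay} seconds")
                    time.sleep(delay)
        if deleted > 0:
            logger.info(f"Deleted {deleted} entities in total")
        return deleted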
--- a/datahub/ingestion/source/gc/execution_request_cleanup.py
+++ b/datahub/ingestion/source/gc/execution_request_cleanup.py
@@ -1,3 +1,4 @@
+import datetime
 import logging
 import time
 from typing import Any, Dict, Iterator, Optional
@@ -42,16 +43,28 @@ class DatahubExecutionRequestCleanupConfig(ConfigModel):
         description="Global switch for this cleanup task",
     )
 
+    runtime_limit_seconds: int = Field(
+        default=3600,
+        description="Maximum runtime in seconds for the cleanup task",
+    )
+
+    max_read_errors: int = Field(
+        default=10,
+        description="Maximum number of read errors before aborting",
+    )
+
     def keep_history_max_milliseconds(self):
         return self.keep_history_max_days * 24 * 3600 * 1000
 
 
 class DatahubExecutionRequestCleanupReport(SourceReport):
-
-
-
-
-
+    ergc_records_read: int = 0
+    ergc_records_preserved: int = 0
+    ergc_records_deleted: int = 0
+    ergc_read_errors: int = 0
+    ergc_delete_errors: int = 0
+    ergc_start_time: Optional[datetime.datetime] = None
+    ergc_end_time: Optional[datetime.datetime] = None
 
 
 class CleanupRecord(BaseModel):
@@ -124,6 +137,15 @@ class DatahubExecutionRequestCleanup:
         params.update(overrides)
 
         while True:
+            if self._reached_runtime_limit():
+                break
+            if self.report.ergc_read_errors >= self.config.max_read_errors:
+                self.report.failure(
+                    title="Too many read errors, aborting",
+                    message="Too many read errors, aborting",
+                    context=str(self.instance_id),
+                )
+                break
             try:
                 url = f"{self.graph.config.server}/openapi/v2/entity/{DATAHUB_EXECUTION_REQUEST_ENTITY_NAME}"
                 response = self.graph._session.get(url, headers=headers, params=params)
@@ -138,10 +160,13 @@ class DatahubExecutionRequestCleanup:
                     break
                 params["scrollId"] = document["scrollId"]
             except Exception as e:
-
-
+                self.report.failure(
+                    title="Failed to fetch next batch of execution requests",
+                    message="Failed to fetch next batch of execution requests",
+                    context=str(self.instance_id),
+                    exc=e,
                 )
-                self.report.
+                self.report.ergc_read_errors += 1
 
     def _scroll_garbage_records(self):
         state: Dict[str, Dict] = {}
@@ -150,7 +175,7 @@ class DatahubExecutionRequestCleanup:
         running_guard_timeout = now_ms - 30 * 24 * 3600 * 1000
 
         for entry in self._scroll_execution_requests():
-            self.report.
+            self.report.ergc_records_read += 1
            key = entry.ingestion_source
 
             # Always delete corrupted records
@@ -171,7 +196,7 @@ class DatahubExecutionRequestCleanup:
 
             # Do not delete if number of requests is below minimum
             if state[key]["count"] < self.config.keep_history_min_count:
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if number of requests do not exceed allowed maximum,
@@ -179,7 +204,7 @@ class DatahubExecutionRequestCleanup:
             if (state[key]["count"] < self.config.keep_history_max_count) and (
                 entry.requested_at > state[key]["cutoffTimestamp"]
             ):
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Do not delete if status is RUNNING or PENDING and created within last month. If the record is >month old and it did not
@@ -188,7 +213,7 @@ class DatahubExecutionRequestCleanup:
                 "RUNNING",
                 "PENDING",
             ]:
-                self.report.
+                self.report.ergc_records_preserved += 1
                 continue
 
             # Otherwise delete current record
@@ -200,7 +225,7 @@ class DatahubExecutionRequestCleanup:
                     f"record timestamp: {entry.requested_at}."
                 )
             )
-            self.report.
+            self.report.ergc_records_deleted += 1
             yield entry
 
     def _delete_entry(self, entry: CleanupRecord) -> None:
@@ -210,10 +235,26 @@ class DatahubExecutionRequestCleanup:
             )
             self.graph.delete_entity(entry.urn, True)
         except Exception as e:
-            self.report.
-
-
+            self.report.ergc_delete_errors += 1
+            self.report.failure(
+                title="Failed to delete ExecutionRequest",
+                message="Failed to delete ExecutionRequest",
+                context=str(self.instance_id),
+                exc=e,
+            )
+
+    def _reached_runtime_limit(self) -> bool:
+        if (
+            self.config.runtime_limit_seconds
+            and self.report.ergc_start_time
+            and (
+                datetime.datetime.now() - self.report.ergc_start_time
+                >= datetime.timedelta(seconds=self.config.runtime_limit_seconds)
             )
+        ):
+            logger.info(f"ergc({self.instance_id}): max runtime reached.")
+            return True
+        return False
 
     def run(self) -> None:
         if not self.config.enabled:
@@ -221,6 +262,7 @@ class DatahubExecutionRequestCleanup:
                 f"ergc({self.instance_id}): ExecutionRequest cleaner is disabled."
             )
             return
+        self.report.ergc_start_time = datetime.datetime.now()
 
         logger.info(
             (
@@ -232,8 +274,11 @@ class DatahubExecutionRequestCleanup:
         )
 
         for entry in self._scroll_garbage_records():
+            if self._reached_runtime_limit():
+                break
            self._delete_entry(entry)
 
+        self.report.ergc_end_time = datetime.datetime.now()
         logger.info(
             f"ergc({self.instance_id}): Finished cleanup of ExecutionRequest records."
         )
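The execution-request cleanup now records start and end timestamps, stops once `runtime_limit_seconds` is exceeded, and aborts after `max_read_errors` read failures. A stripped-down sketch of the runtime-limit guard (the class and work items below are placeholders, not DataHub APIs):

    import datetime
    from typing import Callable, Iterable, Optional

    class RuntimeLimitedTask:
        def __init__(self, runtime_limit_seconds: int = 3600) -> None:
            self.runtime_limit_seconds = runtime_limit_seconds
            self.start_time: Optional[datetime.datetime] = None

        def _reached_runtime_limit(self) -> bool:
            return (
                self.start_time is not None
                and datetime.datetime.now() - self.start_time
                >= datetime.timedelta(seconds=self.runtime_limit_seconds)
            )

        def run(self, work_items: Iterable[Callable[[], None]]) -> None:
            self.start_time = datetime.datetime.now()
            for item in work_items:
                if self._reached_runtime_limit():
                    break  # stop cleanly instead of overrunning the time budget
                item()

    RuntimeLimitedTask(runtime_limit_seconds=1).run([])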