acryl-datahub 0.15.0rc10__py3-none-any.whl → 0.15.0rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/METADATA +2572 -2536
- {acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/RECORD +10 -10
- datahub/__init__.py +1 -1
- datahub/cli/delete_cli.py +37 -4
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/ingestion/source/gc/dataprocess_cleanup.py +47 -23
- datahub/ingestion/source/pulsar.py +11 -1
- {acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/RECORD CHANGED

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=RtQFMiAfUcKAAt_1AITPz1UOKRx2gIW0yLbLYUEChqU,575
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,7 +61,7 @@ datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/cli/check_cli.py,sha256=9dXNyzZayHeoFjwFjLkMVyx6DiCZfeESyI-sYtGA6bE,12850
 datahub/cli/cli_utils.py,sha256=xMEK4tmoX2f_5BihxWrApOkayVLwhpPvMUU842x6FsI,13111
 datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
-datahub/cli/delete_cli.py,sha256=
+datahub/cli/delete_cli.py,sha256=Z7iXNr4ZMmghCnldU8laK4SwTNrhQEEnnUH_TeaBKog,21838
 datahub/cli/docker_check.py,sha256=rED4wHXqxcQ_qNFyIgFEZ85BHT9ZTE5YC-oUKqbRqi0,9432
 datahub/cli/docker_cli.py,sha256=QGoWFp8ZZsXOSMbgu0Q4snMmMmtP3epWAN-fYglUNEc,36491
 datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
@@ -97,7 +97,7 @@ datahub/configuration/git.py,sha256=s55eUHxKqVZgtVsISaDyS-1F4iZBiybbjYsjbp5LU5o,
 datahub/configuration/import_resolver.py,sha256=b4Ie9L7knN1LALEVMxTcNFSklDD6CVE-4Ipy4ZYhNYA,369
 datahub/configuration/json_loader.py,sha256=vIDnjwXWi9yHDO8KW64EupOzOb_sspehGCD7xGHzg84,302
 datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVgI,2579
-datahub/configuration/kafka_consumer_config.py,sha256=
+datahub/configuration/kafka_consumer_config.py,sha256=LivsObTt9yC3WoGnslJbF_x4ojfNdxMIMEhb8vvJfcA,2133
 datahub/configuration/pattern_utils.py,sha256=Q5IB9RfWOOo5FvRVBU7XkhiwHCxSQ1NTMfUlWtWI9qc,699
 datahub/configuration/pydantic_migration_helpers.py,sha256=4C_COAVZ5iJ8yxcWNgXZNWsY7ULogICNZ368oNF7zWg,1462
 datahub/configuration/source_common.py,sha256=68LZOuB23zSEcfgQJE1wZQnyYQHVVnEZK3Sniv_nEQs,2107
@@ -201,7 +201,7 @@ datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPG
 datahub/ingestion/source/openapi.py,sha256=3ea2ORz1cuq4e7L2hSjxG9Cw3__pVoJ5UNYTJS3EnKU,17386
 datahub/ingestion/source/openapi_parser.py,sha256=1_68wHWe_SzWYEyC1YVDw9vxoadKjW1yv8DecvyIhwY,13606
 datahub/ingestion/source/preset.py,sha256=eq7h1qKs8nfSBVot1ofN-YgZhw_rzq8DG4cKOGfDHko,3948
-datahub/ingestion/source/pulsar.py,sha256=
+datahub/ingestion/source/pulsar.py,sha256=H8XJC7xIX8Kdkd7006PxllAGVO_Pjza8Xx9VUBOvpPc,19827
 datahub/ingestion/source/redash.py,sha256=E-a14X19zppPun7_-S-pZ2lRiw1-68QiT-jL7bDzG10,32057
 datahub/ingestion/source/salesforce.py,sha256=S6LSM6mzl8-zKbrJPoINhM1SCpYfM244Xb74pbEI-J0,31792
 datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
@@ -301,7 +301,7 @@ datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP
 datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
 datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gc/datahub_gc.py,sha256=f6Erj3KfD0Hx3ydwL5MUVCZgFzS9c6U2Pkr54JLIUOA,12394
-datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=
+datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=GOdLVYYfmmHV7Xtr-cwCTiQiRSVwq80Nu0EmARyoEeo,15323
 datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=cHJmxz4NmA7VjTX2iGEo3wZ_SDrjC_rCQcnRxKgfUVI,8713
 datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=_tms5AqNAJRDRzQmyN_VydzXbdME2lkvTwa5u1La5z8,7353
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
+acryl_datahub-0.15.0rc11.dist-info/METADATA,sha256=0GnLLvLM_Fvdw4XNusJLRrDZ0vemeu_B0ftz8XQXmbs,174408
+acryl_datahub-0.15.0rc11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0rc11.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+acryl_datahub-0.15.0rc11.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0rc11.dist-info/RECORD,,
datahub/__init__.py CHANGED

datahub/cli/delete_cli.py CHANGED
@@ -214,14 +214,47 @@ def references(urn: str, dry_run: bool, force: bool) -> None:


 @delete.command()
-@click.option("--urn", required=
-
+@click.option("--urn", required=False, type=str, help="the urn of the entity")
+@click.option(
+    "-p",
+    "--platform",
+    required=False,
+    type=str,
+    help="Platform filter (e.g. snowflake)",
+)
+@click.option(
+    "-b",
+    "--batch-size",
+    required=False,
+    default=3000,
+    type=int,
+    help="Batch size when querying for entities to un-soft delete."
+    "Maximum 10000. Large batch sizes may cause timeouts.",
+)
+def undo_by_filter(
+    urn: Optional[str], platform: Optional[str], batch_size: int
+) -> None:
     """
-    Undo
+    Undo soft deletion by filters
     """
     graph = get_default_graph()
     logger.info(f"Using {graph}")
-
+    if urn:
+        graph.set_soft_delete_status(urn=urn, delete=False)
+    else:
+        urns = list(
+            graph.get_urns_by_filter(
+                platform=platform,
+                query="*",
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=batch_size,
+            )
+        )
+        logger.info(f"Going to un-soft delete {len(urns)} urns")
+        urns_iter = progressbar.progressbar(urns, redirect_stdout=True)
+        for urn in urns_iter:
+            assert urn
+            graph.set_soft_delete_status(urn=urn, delete=False)


 @delete.command(no_args_is_help=True)
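The new undo_by_filter command reverses soft deletes either for a single entity or in bulk. A usage sketch, assuming Click exposes the function as undo-by-filter under the existing datahub delete group; the URN below is a placeholder:

  # Un-soft-delete one entity
  datahub delete undo-by-filter --urn "<urn>"
  # Un-soft-delete every soft-deleted Snowflake entity, 5000 URNs per query
  datahub delete undo-by-filter --platform snowflake --batch-size 5000

When --urn is omitted, the command pages through get_urns_by_filter with status=ONLY_SOFT_DELETED and calls set_soft_delete_status(delete=False) on each returned URN, as shown in the diff above.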
datahub/configuration/kafka_consumer_config.py CHANGED

@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import Any, Dict, Optional

@@ -34,5 +35,34 @@ class CallableConsumerConfig:
                 "oauth_cb must be a string representing python function reference "
                 "in the format <python-module>:<function-name>."
             )
+
+        call_back_fn = import_path(call_back)
+        self._validate_call_back_fn_signature(call_back_fn)
+
         # Set the callback
-        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] =
+        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = call_back_fn
+
+    def _validate_call_back_fn_signature(self, call_back_fn: Any) -> None:
+        sig = inspect.signature(call_back_fn)
+
+        num_positional_args = len(
+            [
+                param
+                for param in sig.parameters.values()
+                if param.kind
+                in (
+                    inspect.Parameter.POSITIONAL_ONLY,
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                )
+                and param.default == inspect.Parameter.empty
+            ]
+        )
+
+        has_variadic_args = any(
+            param.kind == inspect.Parameter.VAR_POSITIONAL
+            for param in sig.parameters.values()
+        )
+
+        assert num_positional_args == 1 or (
+            has_variadic_args and num_positional_args <= 1
+        ), "oauth_cb function must accept single positional argument."
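CallableConsumerConfig now resolves the configured oauth_cb string to a function and validates its signature up front: exactly one required positional parameter (variadic *args also passes). A minimal sketch of a conforming callback, assuming confluent-kafka's contract of calling the function with the sasl.oauthbearer.config string and expecting a (token, expiry) pair back; the module and function names here are hypothetical:

import time
from typing import Tuple


# Hypothetical module, referenced in the consumer config as
# "oauth_cb": "my_company.oauth:get_oauth_token"
def get_oauth_token(oauthbearer_config: str) -> Tuple[str, float]:
    # One required positional parameter, so the new signature check passes.
    # A zero-argument function, or one needing two positional arguments,
    # would now fail the assertion when the config is parsed.
    token = "placeholder-token"        # fetch a real bearer token here
    expires_at = time.time() + 3600.0  # absolute expiry time in seconds
    return token, expires_at

The check runs while the config is parsed, so a mismatched callback fails fast instead of erroring later inside the Kafka client.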
datahub/ingestion/source/gc/dataprocess_cleanup.py CHANGED

@@ -208,22 +208,28 @@ class DataProcessCleanup:
         dpis = []
         start = 0
         while True:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                job_query_result = self.ctx.graph.execute_graphql(
+                    DATA_PROCESS_INSTANCES_QUERY,
+                    {"dataJobUrn": job_urn, "start": start, "count": batch_size},
+                )
+                job_data = job_query_result.get("dataJob")
+                if not job_data:
+                    logger.error(f"Error getting job {job_urn}")
+                    break
+
+                runs_data = job_data.get("runs")
+                if not runs_data:
+                    logger.error(f"Error getting runs for {job_urn}")
+                    break
+
+                runs = runs_data.get("runs")
+                dpis.extend(runs)
+                start += batch_size
+                if len(runs) < batch_size:
+                    break
+            except Exception as e:
+                logger.error(f"Exception while fetching DPIs for job {job_urn}: {e}")
                 break
         return dpis

@@ -243,8 +249,12 @@ class DataProcessCleanup:
                 futures[future] = dpi

            for future in as_completed(futures):
-
-
+                try:
+                    future.result()
+                    deleted_count_last_n += 1
+                    futures[future]["deleted"] = True
+                except Exception as e:
+                    logger.error(f"Exception while deleting DPI: {e}")

                if deleted_count_last_n % self.config.batch_size == 0:
                    logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
@@ -279,7 +289,7 @@ class DataProcessCleanup:
         dpis = self.fetch_dpis(job.urn, self.config.batch_size)
         dpis.sort(
             key=lambda x: x["created"]["time"]
-            if
+            if "created" in x and "time" in x["created"]
             else 0,
             reverse=True,
         )
@@ -314,15 +324,23 @@ class DataProcessCleanup:
            if dpi.get("deleted"):
                continue

-            if
+            if (
+                "created" not in dpi
+                or "time" not in dpi["created"]
+                or dpi["created"]["time"] < retention_time * 1000
+            ):
                future = executor.submit(
                    self.delete_entity, dpi["urn"], "dataprocessInstance"
                )
                futures[future] = dpi

        for future in as_completed(futures):
-
-
+            try:
+                future.result()
+                deleted_count_retention += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                logger.error(f"Exception while deleting DPI: {e}")

            if deleted_count_retention % self.config.batch_size == 0:
                logger.info(
@@ -378,8 +396,11 @@ class DataProcessCleanup:
            dataFlows[flow.urn] = flow

        scroll_id: Optional[str] = None
+        previous_scroll_id: Optional[str] = None
+
        dataJobs: Dict[str, List[DataJobEntity]] = defaultdict(list)
        deleted_jobs: int = 0
+
        while True:
            result = self.ctx.graph.execute_graphql(
                DATAJOB_QUERY,
@@ -426,9 +447,11 @@ class DataProcessCleanup:
                else:
                    dataJobs[datajob_entity.flow_urn].append(datajob_entity)

-            if not scroll_id:
+            if not scroll_id or previous_scroll_id == scroll_id:
                break

+            previous_scroll_id = scroll_id
+
        logger.info(f"Deleted {deleted_jobs} DataJobs")
        # Delete empty dataflows if needed
        if self.config.delete_empty_data_flows:
@@ -443,4 +466,5 @@ class DataProcessCleanup:
            if deleted_jobs % self.config.batch_size == 0:
                logger.info(f"Deleted {deleted_data_flows} DataFlows")
        logger.info(f"Deleted {deleted_data_flows} DataFlows")
+
        return []
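Two of the changes above are defensive: the GraphQL fetch and the delete futures are wrapped in try/except so one bad run no longer aborts the whole cleanup, and the DataJob scroll loop now tracks previous_scroll_id so a server that keeps returning the same scroll token cannot spin forever. A minimal, self-contained sketch of that second pattern; fetch_page is a hypothetical stand-in for the scrolled GraphQL call:

from typing import Callable, Iterator, List, Optional, Tuple

# fetch_page(scroll_id) -> (next_scroll_id, entities); stand-in for the
# DATAJOB_QUERY call made by the cleanup source.
PageFetcher = Callable[[Optional[str]], Tuple[Optional[str], List[dict]]]


def scroll_all(fetch_page: PageFetcher) -> Iterator[dict]:
    """Drain a scroll-based API, stopping on an empty or repeated scroll id."""
    scroll_id: Optional[str] = None
    previous_scroll_id: Optional[str] = None
    while True:
        scroll_id, entities = fetch_page(scroll_id)
        yield from entities
        # Stop when the server signals the end (no scroll id) or when it hands
        # back the same id twice, which would otherwise loop forever.
        if not scroll_id or previous_scroll_id == scroll_id:
            break
        previous_scroll_id = scroll_id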
datahub/ingestion/source/pulsar.py CHANGED

@@ -78,7 +78,17 @@ class PulsarSchema:
     def __init__(self, schema):
         self.schema_version = schema.get("version")

-
+        schema_data = schema.get("data")
+        if not schema_data:
+            logger.warning("Schema data is empty or None. Using default empty schema.")
+            schema_data = "{}"
+
+        try:
+            avro_schema = json.loads(schema_data)
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
+            avro_schema = {}
+
         self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
         self.schema_description = avro_schema.get("doc")
         self.schema_type = schema.get("type")
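With the new guards, an empty or malformed schema payload no longer propagates a JSON parsing error out of the PulsarSchema constructor; it is logged and replaced with an empty Avro document. A standalone illustration of that fallback; parse_avro_schema is our name for the extracted logic, not a function in the source:

import json
import logging

logger = logging.getLogger(__name__)


def parse_avro_schema(schema_data):
    """Parse Avro schema JSON, returning {} for empty or invalid payloads."""
    if not schema_data:
        logger.warning("Schema data is empty or None. Using default empty schema.")
        schema_data = "{}"
    try:
        return json.loads(schema_data)
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
        return {}


# Both of these now return {} instead of raising:
parse_avro_schema(None)
parse_avro_schema("not json")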
{acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/WHEEL: File without changes
{acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/entry_points.txt: File without changes
{acryl_datahub-0.15.0rc10.dist-info → acryl_datahub-0.15.0rc11.dist-info}/top_level.txt: File without changes