acryl-datahub 0.15.0rc9__py3-none-any.whl → 0.15.0rc11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/METADATA +2492 -2456
- {acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/RECORD +12 -12
- datahub/__init__.py +1 -1
- datahub/cli/delete_cli.py +37 -4
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/ingestion/source/gc/dataprocess_cleanup.py +47 -23
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/pulsar.py +11 -1
- datahub/ingestion/source/tableau/tableau.py +53 -27
- {acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256
+datahub/__init__.py,sha256=RtQFMiAfUcKAAt_1AITPz1UOKRx2gIW0yLbLYUEChqU,575
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,7 +61,7 @@ datahub/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/cli/check_cli.py,sha256=9dXNyzZayHeoFjwFjLkMVyx6DiCZfeESyI-sYtGA6bE,12850
 datahub/cli/cli_utils.py,sha256=xMEK4tmoX2f_5BihxWrApOkayVLwhpPvMUU842x6FsI,13111
 datahub/cli/config_utils.py,sha256=yuXw7RzpRY5x_-MAoqWbv46qUkIeRNAJL4_OeJpYdBE,4879
-datahub/cli/delete_cli.py,sha256=
+datahub/cli/delete_cli.py,sha256=Z7iXNr4ZMmghCnldU8laK4SwTNrhQEEnnUH_TeaBKog,21838
 datahub/cli/docker_check.py,sha256=rED4wHXqxcQ_qNFyIgFEZ85BHT9ZTE5YC-oUKqbRqi0,9432
 datahub/cli/docker_cli.py,sha256=QGoWFp8ZZsXOSMbgu0Q4snMmMmtP3epWAN-fYglUNEc,36491
 datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
@@ -97,7 +97,7 @@ datahub/configuration/git.py,sha256=s55eUHxKqVZgtVsISaDyS-1F4iZBiybbjYsjbp5LU5o,
 datahub/configuration/import_resolver.py,sha256=b4Ie9L7knN1LALEVMxTcNFSklDD6CVE-4Ipy4ZYhNYA,369
 datahub/configuration/json_loader.py,sha256=vIDnjwXWi9yHDO8KW64EupOzOb_sspehGCD7xGHzg84,302
 datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVgI,2579
-datahub/configuration/kafka_consumer_config.py,sha256=
+datahub/configuration/kafka_consumer_config.py,sha256=LivsObTt9yC3WoGnslJbF_x4ojfNdxMIMEhb8vvJfcA,2133
 datahub/configuration/pattern_utils.py,sha256=Q5IB9RfWOOo5FvRVBU7XkhiwHCxSQ1NTMfUlWtWI9qc,699
 datahub/configuration/pydantic_migration_helpers.py,sha256=4C_COAVZ5iJ8yxcWNgXZNWsY7ULogICNZ368oNF7zWg,1462
 datahub/configuration/source_common.py,sha256=68LZOuB23zSEcfgQJE1wZQnyYQHVVnEZK3Sniv_nEQs,2107
@@ -201,7 +201,7 @@ datahub/ingestion/source/nifi.py,sha256=ttsjZ9aRUvINmewvKFIQD8Rwa4jcl35WFG-F-jPG
 datahub/ingestion/source/openapi.py,sha256=3ea2ORz1cuq4e7L2hSjxG9Cw3__pVoJ5UNYTJS3EnKU,17386
 datahub/ingestion/source/openapi_parser.py,sha256=1_68wHWe_SzWYEyC1YVDw9vxoadKjW1yv8DecvyIhwY,13606
 datahub/ingestion/source/preset.py,sha256=eq7h1qKs8nfSBVot1ofN-YgZhw_rzq8DG4cKOGfDHko,3948
-datahub/ingestion/source/pulsar.py,sha256=
+datahub/ingestion/source/pulsar.py,sha256=H8XJC7xIX8Kdkd7006PxllAGVO_Pjza8Xx9VUBOvpPc,19827
 datahub/ingestion/source/redash.py,sha256=E-a14X19zppPun7_-S-pZ2lRiw1-68QiT-jL7bDzG10,32057
 datahub/ingestion/source/salesforce.py,sha256=S6LSM6mzl8-zKbrJPoINhM1SCpYfM244Xb74pbEI-J0,31792
 datahub/ingestion/source/source_registry.py,sha256=a2mLjJPLkSI-gYCTb_7U7Jo4D8jGknNQ_yScPIihXFk,1208
@@ -301,7 +301,7 @@ datahub/ingestion/source/fivetran/fivetran_log_api.py,sha256=EAak3hJpe75WZSgz6wP
 datahub/ingestion/source/fivetran/fivetran_query.py,sha256=vLrTj7e-0NxZ2U4bWTB57pih42WirqPlUvwtIRfStlQ,5275
 datahub/ingestion/source/gc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/gc/datahub_gc.py,sha256=f6Erj3KfD0Hx3ydwL5MUVCZgFzS9c6U2Pkr54JLIUOA,12394
-datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=
+datahub/ingestion/source/gc/dataprocess_cleanup.py,sha256=GOdLVYYfmmHV7Xtr-cwCTiQiRSVwq80Nu0EmARyoEeo,15323
 datahub/ingestion/source/gc/execution_request_cleanup.py,sha256=cHJmxz4NmA7VjTX2iGEo3wZ_SDrjC_rCQcnRxKgfUVI,8713
 datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py,sha256=_tms5AqNAJRDRzQmyN_VydzXbdME2lkvTwa5u1La5z8,7353
 datahub/ingestion/source/gcs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -331,7 +331,7 @@ datahub/ingestion/source/looker/looker_constant.py,sha256=GMKYtNXlpojPxa9azridKf
 datahub/ingestion/source/looker/looker_dataclasses.py,sha256=ULWLFWsV2cKmTuOFavD8QjEBmnXmvjyr8RbUB62DwJQ,12178
 datahub/ingestion/source/looker/looker_file_loader.py,sha256=c1ewDrIb9VJg1o-asbwX9gL83kgL01vIETzzbmZIhmw,4267
 datahub/ingestion/source/looker/looker_lib_wrapper.py,sha256=QTTCW-rPNUoazQG_sTJbCARXJzQ7NKS-XKURp2AAWls,11106
-datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=
+datahub/ingestion/source/looker/looker_liquid_tag.py,sha256=mO4G4MNA4YZFvZaDBpdiJ2vP3irC82kY34RdaK4Pbfs,3100
 datahub/ingestion/source/looker/looker_query_model.py,sha256=N0jBbFruiCIIGT6sJn6tNeppeQ78KGTkOwTLirhxFNc,2144
 datahub/ingestion/source/looker/looker_source.py,sha256=AByQxWVfOBqOtZPaR_cw9SB-tFZtfppiKRkFSbcK1GA,65346
 datahub/ingestion/source/looker/looker_template_language.py,sha256=EG4ZfVZ0x53lgaYh2ohzL4ZCy9KsX0TA51XqCmsCd2Q,14328
@@ -485,7 +485,7 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
 datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
 datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
 datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/tableau/tableau.py,sha256=
+datahub/ingestion/source/tableau/tableau.py,sha256=AFlDng8EfvBvZL692hMf_sfzGwpHpUU6FW_ElR4uitQ,131551
 datahub/ingestion/source/tableau/tableau_common.py,sha256=Dy_2pvkPucZJsG_LvQZLlxNEkjh-yOXHlZ4jurq9opM,26069
 datahub/ingestion/source/tableau/tableau_constant.py,sha256=nWElhtDo5kj5mWivZFmtVF_4Ugw0-EatBYWyDVzu5hE,2501
 datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -974,8 +974,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
+acryl_datahub-0.15.0rc11.dist-info/METADATA,sha256=0GnLLvLM_Fvdw4XNusJLRrDZ0vemeu_B0ftz8XQXmbs,174408
+acryl_datahub-0.15.0rc11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0rc11.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+acryl_datahub-0.15.0rc11.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0rc11.dist-info/RECORD,,
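Side note: the sha256= values in RECORD are the URL-safe base64 encoding of each file's SHA-256 digest with the trailing "=" padding stripped, per the wheel spec. A quick way to verify one in Python:

    import base64
    import hashlib

    # Reproduce the RECORD-style hash of an empty file, e.g. datahub/py.typed (size 0 above).
    digest = hashlib.sha256(b"").digest()
    print(base64.urlsafe_b64encode(digest).rstrip(b"=").decode())
    # -> 47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU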
datahub/__init__.py CHANGED
datahub/cli/delete_cli.py CHANGED
@@ -214,14 +214,47 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
 
 
 @delete.command()
-@click.option("--urn", required=
-
+@click.option("--urn", required=False, type=str, help="the urn of the entity")
+@click.option(
+    "-p",
+    "--platform",
+    required=False,
+    type=str,
+    help="Platform filter (e.g. snowflake)",
+)
+@click.option(
+    "-b",
+    "--batch-size",
+    required=False,
+    default=3000,
+    type=int,
+    help="Batch size when querying for entities to un-soft delete."
+    "Maximum 10000. Large batch sizes may cause timeouts.",
+)
+def undo_by_filter(
+    urn: Optional[str], platform: Optional[str], batch_size: int
+) -> None:
     """
-    Undo
+    Undo soft deletion by filters
     """
     graph = get_default_graph()
     logger.info(f"Using {graph}")
-
+    if urn:
+        graph.set_soft_delete_status(urn=urn, delete=False)
+    else:
+        urns = list(
+            graph.get_urns_by_filter(
+                platform=platform,
+                query="*",
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=batch_size,
+            )
+        )
+        logger.info(f"Going to un-soft delete {len(urns)} urns")
+        urns_iter = progressbar.progressbar(urns, redirect_stdout=True)
+        for urn in urns_iter:
+            assert urn
+            graph.set_soft_delete_status(urn=urn, delete=False)
 
 
 @delete.command(no_args_is_help=True)
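For orientation, the new undo-by-filter command boils down to the following client-side sketch (a rough equivalent, assuming a configured DataHub connection; the import path of RemovedStatusFilter may vary by version):

    from datahub.ingestion.graph.client import get_default_graph
    from datahub.ingestion.graph.filters import RemovedStatusFilter

    graph = get_default_graph()
    # Restore every soft-deleted Snowflake entity, as the command does per batch.
    for urn in graph.get_urns_by_filter(
        platform="snowflake",
        query="*",
        status=RemovedStatusFilter.ONLY_SOFT_DELETED,
        batch_size=3000,
    ):
        graph.set_soft_delete_status(urn=urn, delete=False)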
datahub/configuration/kafka_consumer_config.py CHANGED
@@ -1,3 +1,4 @@
+import inspect
 import logging
 from typing import Any, Dict, Optional
 
@@ -34,5 +35,34 @@ class CallableConsumerConfig:
                 "oauth_cb must be a string representing python function reference "
                 "in the format <python-module>:<function-name>."
             )
+
+        call_back_fn = import_path(call_back)
+        self._validate_call_back_fn_signature(call_back_fn)
+
         # Set the callback
-        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] =
+        self._config[CallableConsumerConfig.CALLBACK_ATTRIBUTE] = call_back_fn
+
+    def _validate_call_back_fn_signature(self, call_back_fn: Any) -> None:
+        sig = inspect.signature(call_back_fn)
+
+        num_positional_args = len(
+            [
+                param
+                for param in sig.parameters.values()
+                if param.kind
+                in (
+                    inspect.Parameter.POSITIONAL_ONLY,
+                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
+                )
+                and param.default == inspect.Parameter.empty
+            ]
+        )
+
+        has_variadic_args = any(
+            param.kind == inspect.Parameter.VAR_POSITIONAL
+            for param in sig.parameters.values()
+        )
+
+        assert num_positional_args == 1 or (
+            has_variadic_args and num_positional_args <= 1
+        ), "oauth_cb function must accept single positional argument."
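For reference, a minimal oauth_cb that passes the new validation takes exactly one required positional argument. The module path and token logic below are illustrative only; confluent-kafka passes the sasl.oauthbearer.config string to the callback and expects a (token, expiry-in-epoch-seconds) tuple back:

    import time

    def get_oauth_token(config):
        # "config" is the sasl.oauthbearer.config string supplied by confluent-kafka.
        # Fetch a real token from your identity provider here.
        return "my-access-token", time.time() + 3600

A recipe would then reference it as my_module:get_oauth_token, matching the <python-module>:<function-name> format enforced above.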
datahub/ingestion/source/gc/dataprocess_cleanup.py CHANGED
@@ -208,22 +208,28 @@ class DataProcessCleanup:
         dpis = []
         start = 0
         while True:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                job_query_result = self.ctx.graph.execute_graphql(
+                    DATA_PROCESS_INSTANCES_QUERY,
+                    {"dataJobUrn": job_urn, "start": start, "count": batch_size},
+                )
+                job_data = job_query_result.get("dataJob")
+                if not job_data:
+                    logger.error(f"Error getting job {job_urn}")
+                    break
+
+                runs_data = job_data.get("runs")
+                if not runs_data:
+                    logger.error(f"Error getting runs for {job_urn}")
+                    break
+
+                runs = runs_data.get("runs")
+                dpis.extend(runs)
+                start += batch_size
+                if len(runs) < batch_size:
+                    break
+            except Exception as e:
+                logger.error(f"Exception while fetching DPIs for job {job_urn}: {e}")
                 break
         return dpis
 
@@ -243,8 +249,12 @@ class DataProcessCleanup:
                 futures[future] = dpi
 
        for future in as_completed(futures):
-
-
+            try:
+                future.result()
+                deleted_count_last_n += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                logger.error(f"Exception while deleting DPI: {e}")
 
            if deleted_count_last_n % self.config.batch_size == 0:
                logger.info(f"Deleted {deleted_count_last_n} DPIs from {job.urn}")
@@ -279,7 +289,7 @@ class DataProcessCleanup:
         dpis = self.fetch_dpis(job.urn, self.config.batch_size)
         dpis.sort(
             key=lambda x: x["created"]["time"]
-            if
+            if "created" in x and "time" in x["created"]
             else 0,
             reverse=True,
         )
@@ -314,15 +324,23 @@ class DataProcessCleanup:
                if dpi.get("deleted"):
                    continue
 
-                if
+                if (
+                    "created" not in dpi
+                    or "time" not in dpi["created"]
+                    or dpi["created"]["time"] < retention_time * 1000
+                ):
                    future = executor.submit(
                        self.delete_entity, dpi["urn"], "dataprocessInstance"
                    )
                    futures[future] = dpi
 
        for future in as_completed(futures):
-
-
+            try:
+                future.result()
+                deleted_count_retention += 1
+                futures[future]["deleted"] = True
+            except Exception as e:
+                logger.error(f"Exception while deleting DPI: {e}")
 
            if deleted_count_retention % self.config.batch_size == 0:
                logger.info(
@@ -378,8 +396,11 @@ class DataProcessCleanup:
             dataFlows[flow.urn] = flow
 
         scroll_id: Optional[str] = None
+        previous_scroll_id: Optional[str] = None
+
         dataJobs: Dict[str, List[DataJobEntity]] = defaultdict(list)
         deleted_jobs: int = 0
+
         while True:
             result = self.ctx.graph.execute_graphql(
                 DATAJOB_QUERY,
@@ -426,9 +447,11 @@ class DataProcessCleanup:
                 else:
                     dataJobs[datajob_entity.flow_urn].append(datajob_entity)
 
-            if not scroll_id:
+            if not scroll_id or previous_scroll_id == scroll_id:
                 break
 
+            previous_scroll_id = scroll_id
+
         logger.info(f"Deleted {deleted_jobs} DataJobs")
         # Delete empty dataflows if needed
         if self.config.delete_empty_data_flows:
@@ -443,4 +466,5 @@ class DataProcessCleanup:
             if deleted_jobs % self.config.batch_size == 0:
                 logger.info(f"Deleted {deleted_data_flows} DataFlows")
         logger.info(f"Deleted {deleted_data_flows} DataFlows")
+
         return []
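The previous_scroll_id guard above ends pagination when the server returns no cursor or keeps returning the same one. A self-contained sketch of that loop shape (fetch_page is a stand-in for the DATAJOB_QUERY scroll call):

    def fetch_page(scroll_id):
        # Stub: returns (items, next_scroll_id); the second page repeats its cursor.
        pages = {None: (["job-1"], "s1"), "s1": (["job-2"], "s1")}
        return pages[scroll_id]

    previous_scroll_id = None
    scroll_id = None
    while True:
        items, scroll_id = fetch_page(scroll_id)
        print(items)
        if not scroll_id or previous_scroll_id == scroll_id:
            break  # done, or the server is stuck on the same cursor
        previous_scroll_id = scroll_id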
datahub/ingestion/source/looker/looker_liquid_tag.py CHANGED
@@ -4,6 +4,7 @@ from typing import ClassVar, Optional, TextIO
 from liquid import Environment
 from liquid.ast import Node
 from liquid.context import Context
+from liquid.filter import string_filter
 from liquid.parse import expect, get_parser
 from liquid.stream import TokenStream
 from liquid.tag import Tag
@@ -81,12 +82,18 @@ class ConditionTag(Tag):
 custom_tags = [ConditionTag]
 
 
+@string_filter
+def sql_quote_filter(variable: str) -> str:
+    return f"'{variable}'"
+
+
 @lru_cache(maxsize=1)
 def _create_env() -> Environment:
-    env: Environment = Environment()
+    env: Environment = Environment(strict_filters=False)
     # register tag. One time activity
     for custom_tag in custom_tags:
         env.add_tag(custom_tag)
+    env.add_filter("sql_quote", sql_quote_filter)
     return env
 
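The net effect: LookML liquid such as WHERE city = {{ city | sql_quote }} now renders with SQL quoting (mirroring Looker's built-in sql_quote filter) instead of failing on an unknown filter. A minimal standalone demonstration with python-liquid:

    from liquid import Environment
    from liquid.filter import string_filter

    @string_filter
    def sql_quote_filter(variable: str) -> str:
        return f"'{variable}'"

    env = Environment(strict_filters=False)  # unknown filters no longer raise
    env.add_filter("sql_quote", sql_quote_filter)
    template = env.from_string("WHERE city = {{ city | sql_quote }}")
    print(template.render(city="Tokyo"))  # WHERE city = 'Tokyo'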
datahub/ingestion/source/pulsar.py CHANGED
@@ -78,7 +78,17 @@ class PulsarSchema:
     def __init__(self, schema):
         self.schema_version = schema.get("version")
 
-
+        schema_data = schema.get("data")
+        if not schema_data:
+            logger.warning("Schema data is empty or None. Using default empty schema.")
+            schema_data = "{}"
+
+        try:
+            avro_schema = json.loads(schema_data)
+        except json.JSONDecodeError as e:
+            logger.error(f"Invalid JSON schema: {schema_data}. Error: {str(e)}")
+            avro_schema = {}
+
         self.schema_name = avro_schema.get("namespace") + "." + avro_schema.get("name")
         self.schema_description = avro_schema.get("doc")
         self.schema_type = schema.get("type")
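The guard matters because json.loads raises on empty input, which is what a schemaless or empty-schema Pulsar topic returns. A small demonstration of the fallback behavior:

    import json

    for raw in (None, "", "not-json", '{"namespace": "ns", "name": "t"}'):
        schema_data = raw or "{}"  # empty/None falls back to an empty schema
        try:
            avro_schema = json.loads(schema_data)
        except json.JSONDecodeError:
            avro_schema = {}
        print(avro_schema)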
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -68,6 +68,7 @@ from datahub.ingestion.api.source import (
     CapabilityReport,
     MetadataWorkUnitProcessor,
     Source,
+    StructuredLogLevel,
     TestableSource,
     TestConnectionReport,
 )
@@ -289,16 +290,12 @@ class TableauConnectionConfig(ConfigModel):
             server.auth.sign_in(authentication)
             return server
         except ServerResponseError as e:
+            message = f"Unable to login (invalid/expired credentials or missing permissions): {str(e)}"
             if isinstance(authentication, PersonalAccessTokenAuth):
                 # Docs on token expiry in Tableau:
                 # https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm#token-expiry
-
-
-                    "expire if not used for 15 days or if over 1 year old"
-                )
-            raise ValueError(
-                f"Unable to login (invalid/expired credentials or missing permissions): {str(e)}"
-            ) from e
+                message = f"Error authenticating with Tableau. Note that Tableau personal access tokens expire if not used for 15 days or if over 1 year old: {str(e)}"
+            raise ValueError(message) from e
         except Exception as e:
             raise ValueError(
                 f"Unable to login (check your Tableau connection and credentials): {str(e)}"
@@ -700,6 +697,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                     config=self.config,
                     ctx=self.ctx,
                     site=site,
+                    site_id=site.id,
                     report=self.report,
                     server=self.server,
                     platform=self.platform,
@@ -707,11 +705,19 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                 logger.info(f"Ingesting assets of site '{site.content_url}'.")
                 yield from site_source.ingest_tableau_site()
         else:
-            site =
+            site = None
+            with self.report.report_exc(
+                title="Unable to fetch site details. Site hierarchy may be incomplete and external urls may be missing.",
+                message="This usually indicates missing permissions. Ensure that you have all necessary permissions.",
+                level=StructuredLogLevel.WARN,
+            ):
+                site = self.server.sites.get_by_id(self.server.site_id)
+
             site_source = TableauSiteSource(
                 config=self.config,
                 ctx=self.ctx,
                 site=site,
+                site_id=self.server.site_id,
                 report=self.report,
                 server=self.server,
                 platform=self.platform,
@@ -722,6 +728,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
                 title="Failed to Retrieve Tableau Metadata",
                 message="Unable to retrieve metadata from tableau.",
                 context=str(md_exception),
+                exc=md_exception,
             )
 
     def close(self) -> None:
@@ -743,7 +750,8 @@ class TableauSiteSource:
         self,
         config: TableauConfig,
         ctx: PipelineContext,
-        site: SiteItem,
+        site: Optional[SiteItem],
+        site_id: Optional[str],
         report: TableauSourceReport,
         server: Server,
         platform: str,
@@ -752,9 +760,16 @@ class TableauSiteSource:
         self.report = report
         self.server: Server = server
         self.ctx: PipelineContext = ctx
-        self.site: SiteItem = site
         self.platform = platform
 
+        self.site: Optional[SiteItem] = site
+        if site_id is not None:
+            self.site_id: str = site_id
+        else:
+            assert self.site is not None, "site or site_id is required"
+            assert self.site.id is not None, "site_id is required when site is provided"
+            self.site_id = self.site.id
+
         self.database_tables: Dict[str, DatabaseTable] = {}
         self.tableau_stat_registry: Dict[str, UsageStat] = {}
         self.tableau_project_registry: Dict[str, TableauProject] = {}
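TableauSiteSource now accepts either a full SiteItem or a bare site_id. The resolution logic above reduces to the following standalone sketch (_Site is a hypothetical stand-in for tableauserverclient's SiteItem):

    from typing import Optional

    class _Site:
        def __init__(self, id: Optional[str]) -> None:
            self.id = id

    def resolve_site_id(site: Optional[_Site], site_id: Optional[str]) -> str:
        if site_id is not None:
            return site_id  # an explicit site_id wins
        assert site is not None, "site or site_id is required"
        assert site.id is not None, "site_id is required when site is provided"
        return site.id

    print(resolve_site_id(None, "abc123"))         # abc123
    print(resolve_site_id(_Site("def456"), None))  # def456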
@@ -808,7 +823,7 @@ class TableauSiteSource:
     def _re_authenticate(self):
         tableau_auth: Union[
             TableauAuth, PersonalAccessTokenAuth
-        ] = self.config.get_tableau_auth(self.
+        ] = self.config.get_tableau_auth(self.site_id)
         self.server.auth.sign_in(tableau_auth)
 
     @property
@@ -826,6 +841,7 @@ class TableauSiteSource:
             if not view.id:
                 continue
             self.tableau_stat_registry[view.id] = UsageStat(view_count=view.total_views)
+        logger.info(f"Got Tableau stats for {len(self.tableau_stat_registry)} assets")
         logger.debug("Tableau stats %s", self.tableau_stat_registry)
 
     def _populate_database_server_hostname_map(self) -> None:
@@ -876,7 +892,7 @@ class TableauSiteSource:
         ancestors = [cur_proj.name]
         while cur_proj.parent_id is not None:
             if cur_proj.parent_id not in all_project_map:
-                self.report.
+                self.report.warning(
                     "project-issue",
                     f"Parent project {cur_proj.parent_id} not found. We need Site Administrator Explorer permissions.",
                 )
@@ -974,8 +990,11 @@ class TableauSiteSource:
                 self.datasource_project_map[ds.id] = ds.project_id
         except Exception as e:
             self.report.get_all_datasources_query_failed = True
-
-
+            self.report.warning(
+                title="Unexpected Query Error",
+                message="Get all datasources query failed due to error",
+                exc=e,
+            )
 
     def _init_workbook_registry(self) -> None:
         if self.server is None:
@@ -1141,7 +1160,6 @@ class TableauSiteSource:
         )
 
         if node_limit_errors:
-            logger.debug(f"Node Limit Error. query_data {query_data}")
             self.report.warning(
                 title="Tableau Data Exceed Predefined Limit",
                 message="The numbers of record in result set exceeds a predefined limit. Increase the tableau "
@@ -1257,9 +1275,10 @@ class TableauSiteSource:
             wrk_id: Optional[str] = workbook.get(c.ID)
             prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
 
-
-
-
+            self.report.warning(
+                title="Skipping Missing Workbook",
+                message="Skipping workbook as its project is not present in project registry",
+                context=f"workbook={wrk_name}({wrk_id}), project={prj_name}({project_luid})",
             )
             continue
 
@@ -1453,7 +1472,7 @@ class TableauSiteSource:
                 c.COLUMNS_CONNECTION
             ].get("totalCount")
             if not is_custom_sql and not num_tbl_cols:
-                logger.
+                logger.warning(
                     f"Skipping upstream table with id {table[c.ID]}, no columns: {table}"
                 )
                 continue
@@ -1469,7 +1488,12 @@ class TableauSiteSource:
                     table, default_schema_map=self.config.default_schema_map
                 )
             except Exception as e:
-
+                self.report.warning(
+                    title="Potentially Missing Lineage Issue",
+                    message="Failed to generate upstream reference",
+                    exc=e,
+                    context=f"table={table}",
+                )
                 continue
 
             table_urn = ref.make_dataset_urn(
@@ -1917,10 +1941,12 @@ class TableauSiteSource:
             self.datasource_project_map[ds_result.id] = ds_result.project_id
         except Exception as e:
             self.report.num_get_datasource_query_failures += 1
-
-
+            self.report.warning(
+                title="Unexpected Query Error",
+                message="Failed to get datasource details",
+                exc=e,
+                context=f"ds_luid={ds_luid}",
             )
-            logger.debug("Error stack trace", exc_info=True)
 
     def _get_workbook_project_luid(self, wb: dict) -> Optional[str]:
         if wb.get(c.LUID) and self.workbook_project_map.get(wb[c.LUID]):
@@ -3181,10 +3207,10 @@ class TableauSiteSource:
         else:
             # This is a root Tableau project since the parent_project_id is None.
             # For a root project, either the site is the parent, or the platform is the default parent.
-            if self.config.add_site_container
+            if self.config.add_site_container:
                 # The site containers have already been generated by emit_site_container, so we
                 # don't need to emit them again here.
-                parent_project_key = self.gen_site_key(self.
+                parent_project_key = self.gen_site_key(self.site_id)
 
         yield from gen_containers(
             container_key=project_key,
@@ -3201,12 +3227,12 @@ class TableauSiteSource:
             yield from emit_project_in_topological_order(project)
 
     def emit_site_container(self):
-        if not self.site
+        if not self.site:
             logger.warning("Can not ingest site container. No site information found.")
             return
 
         yield from gen_containers(
-            container_key=self.gen_site_key(self.
+            container_key=self.gen_site_key(self.site_id),
             name=self.site.name or "Default",
             sub_types=[c.SITE],
         )
{acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/WHEEL: File without changes
{acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/entry_points.txt: File without changes
{acryl_datahub-0.15.0rc9.dist-info → acryl_datahub-0.15.0rc11.dist-info}/top_level.txt: File without changes