acryl-datahub 0.15.0rc21__py3-none-any.whl → 0.15.0rc23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/METADATA +2411 -2411
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/RECORD +19 -17
- datahub/__init__.py +1 -1
- datahub/cli/ingest_cli.py +110 -0
- datahub/configuration/git.py +7 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +5 -1
- datahub/ingestion/source/tableau/tableau.py +42 -3
- datahub/ingestion/source/tableau/tableau_common.py +12 -5
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/utilities/file_backed_collections.py +35 -2
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=eOmo10Qg3UHdXM-mhXsProWUviox9Ng9kfUMS-B8xpo,575
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -67,7 +67,7 @@ datahub/cli/docker_cli.py,sha256=QGoWFp8ZZsXOSMbgu0Q4snMmMmtP3epWAN-fYglUNEc,364
 datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
 datahub/cli/exists_cli.py,sha256=IsuU86R-g7BJjAl1vULH6d-BWJHAKa4XHLZl5WxGUEM,1233
 datahub/cli/get_cli.py,sha256=VV80BCXfZ0-C8fr2k43SIuN9DB-fOYP9StWsTHnXwFw,2327
-datahub/cli/ingest_cli.py,sha256=
+datahub/cli/ingest_cli.py,sha256=nRoZvVpsGPXmEZCvSOBfsZ61Ep1fCqYRVp79RBnHSnI,22393
 datahub/cli/json_file.py,sha256=nWo-VVthaaW4Do1eUqgrzk0fShb29MjiKXvZVOTq76c,943
 datahub/cli/lite_cli.py,sha256=UmlMMquce6lHiPaKUBBT0XQtqR9SHEmrGlJyKV9YY60,13030
 datahub/cli/migrate.py,sha256=p42vixwKzi9OHQnIa0K2FxwGvt-1OxXeuYGJzfu5Sqo,17939
@@ -93,7 +93,7 @@ datahub/configuration/common.py,sha256=Ngj2-HKPEhCMbcx3phUqyoOHayhqWNt1t0e2hO3GQ
 datahub/configuration/config_loader.py,sha256=4V8rrbKvCbfEys2Tlw2uZXb3yC9Hpoubn2O8GXhGe3A,5785
 datahub/configuration/connection_resolver.py,sha256=n4-6MwMiOEDgTouxO0SMjTILKVhJPo6-naE6FuR5qMs,1516
 datahub/configuration/datetimes.py,sha256=nayNc0mmlVKH6oVv9ud6C1dDUiZPGabW-YZxvrkosPg,2870
-datahub/configuration/git.py,sha256=
+datahub/configuration/git.py,sha256=q9iac6cc6oZ3RVSPTyuR2VMsmt2wr-uVaCLWohdKVV0,6461
 datahub/configuration/import_resolver.py,sha256=b4Ie9L7knN1LALEVMxTcNFSklDD6CVE-4Ipy4ZYhNYA,369
 datahub/configuration/json_loader.py,sha256=vIDnjwXWi9yHDO8KW64EupOzOb_sspehGCD7xGHzg84,302
 datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVgI,2579
@@ -266,9 +266,9 @@ datahub/ingestion/source/data_lake_common/path_spec.py,sha256=u3u2eMe70V5vur-j8m
 datahub/ingestion/source/datahub/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/datahub/config.py,sha256=pOXt0b1PX6D7dtD4RuKwdmr6sQKnXSf6LHxfPUMhP8s,3658
 datahub/ingestion/source/datahub/datahub_api_reader.py,sha256=hlKADVEPoTFiRGKqRsMF5mL4fSu_IrIW8Nx7LpEzvkM,2134
-datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=
+datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=F8JrOjSrmJ2B6m1MWh83A1EYFDcGMla749HUeQWMnL0,9464
 datahub/ingestion/source/datahub/datahub_kafka_reader.py,sha256=8x9_u5kRjgSmu7c295ZIZjxP6bgoZZbWsKRicuLStRQ,4145
-datahub/ingestion/source/datahub/datahub_source.py,sha256=
+datahub/ingestion/source/datahub/datahub_source.py,sha256=2jDnsHEzpGhr00qQI9unSUJYD6Cb1McYFKOVbA-Zcm4,8487
 datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
 datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
 datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -398,7 +398,7 @@ datahub/ingestion/source/s3/config.py,sha256=Zs1nrBZKLImteZreIcSMMRLj8vBGgxakNDs
 datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=k7S9Xcmgr3-CvWrd5NEX-V8JSrcAwkm7vbHPTVZicow,3620
 datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
 datahub/ingestion/source/s3/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
-datahub/ingestion/source/s3/source.py,sha256=
+datahub/ingestion/source/s3/source.py,sha256=8O_vu1J91h7owQlYyK27AZAQHxKsDpNC_jsLNpMed98,47336
 datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sac/sac.py,sha256=zPSO9ukuyhvNaaVzeAYpA-_sFma_XMcCQMPaGvDWuTk,30226
 datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -427,7 +427,7 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
 datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
-datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=suMICPFPvoV6shkjD_14JunLc8jAZBINzlFk2mYldkU,23676
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
 datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
@@ -486,9 +486,11 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
 datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
 datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
 datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/tableau/tableau.py,sha256=
-datahub/ingestion/source/tableau/tableau_common.py,sha256=
-datahub/ingestion/source/tableau/tableau_constant.py,sha256=
+datahub/ingestion/source/tableau/tableau.py,sha256=P_DUuUvXk5u2ihA0JghtRkYc_KI_yQR2ZiQVe9IUvsU,138197
+datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
+datahub/ingestion/source/tableau/tableau_constant.py,sha256=jVQMgLXND5aPL6XLETKp81BehRkvyLTU_Vhhe_1NOkI,2576
+datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=PEGfcoUcBdsnOa5EzCqy1IiuQ3OZ9fZVEMzDqhhHOto,922
+datahub/ingestion/source/tableau/tableau_validation.py,sha256=l0DuXUuxJwEXMzo61xLx-KLc5u6tiz2n0e9EepJdWEM,1808
 datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/unity/analyze_profiler.py,sha256=2pqkFY30CfN4aHgFZZntjeG0hNhBytZJvXC13VfTc1I,4689
 datahub/ingestion/source/unity/config.py,sha256=m4-n7mYz4Ct4L1QdfJFklwHyj8boKCbV7Sb3Ou6AT3Q,14756
@@ -882,7 +884,7 @@ datahub/testing/__init__.py,sha256=TywIuzGQvzJsNhI_PGD1RFk11M3RtGl9jIMtAVVHIkg,2
 datahub/testing/check_imports.py,sha256=EKuJmgUA46uOrlaOy0fCvPB7j9POkpJ0ExhO_pT3YAk,1356
 datahub/testing/check_sql_parser_result.py,sha256=f7U7IUSbfV4VACdNI857wPZ9tAZ9j6mXiXmcJNT_RzM,2671
 datahub/testing/check_str_enum.py,sha256=yqk0XXHOGteN-IGqCp5JHy0Kca13BnI09ZqKc4Nwl3E,1187
-datahub/testing/compare_metadata_json.py,sha256=
+datahub/testing/compare_metadata_json.py,sha256=pVJB2qLoKzEJLBXqFT-qGrxpA1y76y-mIbvJf0NnAD0,5274
 datahub/testing/docker_utils.py,sha256=g169iy_jNR_mg0p8X31cChZqjOryutAIHUYLq3xqueY,2415
 datahub/testing/doctest.py,sha256=1_8WEhHZ2eRQtw8vsXKzr9L5zzvs0Tcr6q4mnkyyvtw,295
 datahub/testing/mcp_diff.py,sha256=_sBFhmclYXJGQ_JYDrvKWXNGXt9ACvqeQvFaZrRHa8Q,10729
@@ -900,7 +902,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
 datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
 datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
 datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
-datahub/utilities/file_backed_collections.py,sha256=
+datahub/utilities/file_backed_collections.py,sha256=I2GxSYtVzfo38pQpv2FyoBeWISiKD4zUi0t34jPCNrU,21957
 datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
 datahub/utilities/hive_schema_to_avro.py,sha256=2-7NI9haCAYbyUmHTb-QPxjn4WbmnDDKAIGmHJ11-1E,11622
 datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
@@ -974,8 +976,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
+acryl_datahub-0.15.0rc23.dist-info/METADATA,sha256=cPlJko8JF1pZEIihXKsAct2ai4okUHAMu8e3sAha7mU,173559
+acryl_datahub-0.15.0rc23.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0rc23.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+acryl_datahub-0.15.0rc23.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0rc23.dist-info/RECORD,,
datahub/__init__.py
CHANGED
datahub/cli/ingest_cli.py
CHANGED
@@ -27,6 +27,7 @@ from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

+INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"]
 RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"]
 RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"]

@@ -437,6 +438,115 @@ def mcps(path: str) -> None:
     sys.exit(ret)


+@ingest.command()
+@click.argument("page_offset", type=int, default=0)
+@click.argument("page_size", type=int, default=100)
+@click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.")
+@click.option(
+    "--source", type=str, default=None, help="Filter by ingestion source name."
+)
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
+    """List ingestion source runs with their details, optionally filtered by URN or source."""
+
+    query = """
+        query listIngestionRuns($input: ListIngestionSourcesInput!) {
+          listIngestionSources(input: $input) {
+            ingestionSources {
+              urn
+              name
+              executions {
+                executionRequests {
+                  id
+                  result {
+                    startTimeMs
+                    status
+                  }
+                }
+              }
+            }
+          }
+        }
+    """
+
+    # filter by urn and/or source using CONTAINS
+    filters = []
+    if urn:
+        filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"})
+    if source:
+        filters.append({"field": "name", "values": [source], "condition": "CONTAIN"})
+
+    variables = {
+        "input": {
+            "start": page_offset,
+            "count": page_size,
+            "filters": filters,
+        }
+    }
+
+    client = get_default_graph()
+    session = client._session
+    gms_host = client.config.server
+
+    url = f"{gms_host}/api/graphql"
+    try:
+        response = session.post(url, json={"query": query, "variables": variables})
+        response.raise_for_status()
+    except Exception as e:
+        click.echo(f"Error fetching data: {str(e)}")
+        return
+
+    try:
+        data = response.json()
+    except ValueError:
+        click.echo("Failed to parse JSON response from server.")
+        return
+
+    if not data:
+        click.echo("No response received from the server.")
+        return
+
+    # when urn or source filter does not match, exit gracefully
+    if (
+        not isinstance(data.get("data"), dict)
+        or "listIngestionSources" not in data["data"]
+    ):
+        click.echo("No matching ingestion sources found. Please check your filters.")
+        return
+
+    ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
+    if not ingestion_sources:
+        click.echo("No ingestion sources or executions found.")
+        return
+
+    rows = []
+    for ingestion_source in ingestion_sources:
+        urn = ingestion_source.get("urn", "N/A")
+        name = ingestion_source.get("name", "N/A")
+
+        executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+        for execution in executions:
+            execution_id = execution.get("id", "N/A")
+            start_time = execution.get("result", {}).get("startTimeMs", "N/A")
+            start_time = (
+                datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
+                if start_time != "N/A"
+                else "N/A"
+            )
+            status = execution.get("result", {}).get("status", "N/A")
+
+            rows.append([execution_id, name, start_time, status, urn])
+
+    click.echo(
+        tabulate(
+            rows,
+            headers=INGEST_SRC_TABLE_COLUMNS,
+            tablefmt="grid",
+        )
+    )
+
+
 @ingest.command()
 @click.argument("page_offset", type=int, default=0)
 @click.argument("page_size", type=int, default=100)
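For reference, the new subcommand is a thin wrapper over DataHub's GraphQL API, and with click's underscore-to-dash naming it should be invokable as `datahub ingest list-source-runs --source snowflake`. Below is a minimal standalone sketch of the same request; the host and the example filter value are placeholders, not part of the diff, and it assumes an unauthenticated local GMS.

# Hedged sketch: the GraphQL call behind the new list-source-runs command.
import requests

GMS_HOST = "http://localhost:8080"  # assumption: local DataHub GMS, no auth
QUERY = """
query listIngestionRuns($input: ListIngestionSourcesInput!) {
  listIngestionSources(input: $input) {
    ingestionSources {
      urn
      name
      executions { executionRequests { id result { startTimeMs status } } }
    }
  }
}
"""

variables = {
    "input": {
        "start": 0,
        "count": 100,
        # the same CONTAIN filter the command builds for --source
        "filters": [{"field": "name", "values": ["snowflake"], "condition": "CONTAIN"}],
    }
}

resp = requests.post(f"{GMS_HOST}/api/graphql", json={"query": QUERY, "variables": variables})
resp.raise_for_status()
for src in resp.json()["data"]["listIngestionSources"]["ingestionSources"]:
    print(src["urn"], src["name"])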
datahub/configuration/git.py
CHANGED
@@ -24,7 +24,11 @@ class GitReference(ConfigModel):
         "main",
         description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
     )
-
+    url_subdir: Optional[str] = Field(
+        default=None,
+        description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. "
+        "Only affects URL generation, not git operations.",
+    )
     url_template: Optional[str] = Field(
         None,
         description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
@@ -68,6 +72,8 @@ class GitReference(ConfigModel):

     def get_url_for_file_path(self, file_path: str) -> str:
         assert self.url_template
+        if self.url_subdir:
+            file_path = f"{self.url_subdir}/{file_path}"
         return self.url_template.format(
             repo_url=self.repo, branch=self.branch, file_path=file_path
         )
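A short sketch of what the new `url_subdir` option changes, assuming the GitHub URL template is inferred from the repo as the field description suggests; the repo and file paths below are hypothetical.

from datahub.configuration.git import GitReference

ref = GitReference(
    repo="https://github.com/acme/analytics",  # hypothetical repo
    branch="main",
    url_subdir="dbt",  # files live under dbt/ in the repo
)

# url_subdir is prepended only when building the URL, so this should print
# https://github.com/acme/analytics/blob/main/dbt/models/orders.sql
print(ref.get_url_for_file_path("models/orders.sql"))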
datahub/ingestion/source/datahub/datahub_database_reader.py
CHANGED

@@ -147,6 +147,47 @@ class DataHubDatabaseReader:
         version
         """

+    def execute_server_cursor(
+        self, query: str, params: Dict[str, Any]
+    ) -> Iterable[Dict[str, Any]]:
+        with self.engine.connect() as conn:
+            if self.engine.dialect.name == "postgresql":
+                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    conn = conn.execution_options(
+                        stream_results=True,
+                        yield_per=self.config.database_query_batch_size,
+                    )
+                    result = conn.execute(query, params)
+                    for row in result:
+                        yield dict(row)
+            elif self.engine.dialect.name == "mysql":  # MySQL
+                import MySQLdb
+
+                with contextlib.closing(
+                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
+                ) as cursor:
+                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
+                    cursor.execute(query, params)
+
+                    columns = [desc[0] for desc in cursor.description]
+                    while True:
+                        rows = cursor.fetchmany(self.config.database_query_batch_size)
+                        if not rows:
+                            break  # Use break instead of return in generator
+                        for row in rows:
+                            yield dict(zip(columns, row))
+            else:
+                raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
+
+    def _get_rows(
+        self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Dict[str, Any]]:
+        params = {
+            "exclude_aspects": list(self.config.exclude_aspects),
+            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
+        }
+        yield from self.execute_server_cursor(self.query, params)
+
     def get_aspects(
         self, from_createdon: datetime, stop_time: datetime
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
@@ -159,27 +200,6 @@ class DataHubDatabaseReader:
         if mcp:
             yield mcp, row["createdon"]

-    def _get_rows(
-        self, from_createdon: datetime, stop_time: datetime
-    ) -> Iterable[Dict[str, Any]]:
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                cursor.execute(
-                    self.query,
-                    {
-                        "exclude_aspects": list(self.config.exclude_aspects),
-                        "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-                    },
-                )
-
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
-
     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
         Fetches all soft-deleted entities from the database.
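The refactor replaces a plain fetchmany loop with dialect-aware server-side cursors, so memory stays bounded while the whole aspect table streams through. A standalone sketch of the PostgreSQL branch, assuming SQLAlchemy 1.4.40+ (for the `yield_per` execution option) and an example connection URL:

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/datahub")  # example URL

def stream_rows(query: str, batch_size: int = 1000):
    with engine.connect() as conn:
        with conn.begin():  # PostgreSQL server-side cursors require a transaction
            result = conn.execution_options(
                stream_results=True, yield_per=batch_size
            ).execute(text(query))
            for row in result:
                yield dict(row._mapping)

for row in stream_rows("SELECT urn, version FROM metadata_aspect_v2"):
    print(row["urn"])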
datahub/ingestion/source/datahub/datahub_source.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from functools import partial
 from typing import Dict, Iterable, List, Optional

@@ -26,6 +26,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.schema_classes import ChangeTypeClass
+from datahub.utilities.progress_timer import ProgressTimer

 logger = logging.getLogger(__name__)

@@ -105,11 +106,17 @@ class DataHubSource(StatefulIngestionSourceBase):
         self, from_createdon: datetime, reader: DataHubDatabaseReader
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
+        progress = ProgressTimer(report_every=timedelta(seconds=60))
         mcps = reader.get_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
             if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                 continue

+            if progress.should_report():
+                logger.info(
+                    f"Ingested {i} database aspects so far, currently at {createdon}"
+                )
+
             yield mcp.as_workunit()
             self.report.num_database_aspects_ingested += 1
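ProgressTimer throttles the new progress log to roughly one line per minute regardless of row throughput. A hedged stand-in for the contract used above (an illustration, not the actual datahub.utilities.progress_timer source):

from datetime import datetime, timedelta

class ProgressTimer:
    """Illustrative stand-in: should_report() returns True at most once per interval."""

    def __init__(self, report_every: timedelta):
        self._report_every = report_every
        self._last_report = datetime.min  # forces a report on the first check

    def should_report(self) -> bool:
        now = datetime.now()
        if now - self._last_report >= self._report_every:
            self._last_report = now
            return True
        return False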
datahub/ingestion/source/s3/source.py
CHANGED

@@ -9,6 +9,7 @@ from datetime import datetime
 from itertools import groupby
 from pathlib import PurePath
 from typing import Any, Dict, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse

 import smart_open.compression as so_compression
 from more_itertools import peekable
@@ -993,9 +994,7 @@ class S3Source(StatefulIngestionSourceBase):
         folders = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
-            prefix_to_process = dir.
-                self.create_s3_path(bucket_name, "/")
-            )
+            prefix_to_process = urlparse(dir).path.lstrip("/")

             folders.extend(
                 self.get_folder_info(
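The one-line replacement works because urlparse splits an s3:// URI into bucket (netloc) and key (path), so the prefix no longer has to be recovered by stripping a reconstructed bucket path:

from urllib.parse import urlparse

uri = "s3://my-bucket/raw/events/2024/"  # example URI
parsed = urlparse(uri)
print(parsed.netloc)            # my-bucket
print(parsed.path.lstrip("/"))  # raw/events/2024/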
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
CHANGED

@@ -414,9 +414,13 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
             upstream_tables = db_row.get("UPSTREAM_TABLES")
+            downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-
+                # Tricky: sometimes the full row data is too large, and so the context
+                # message gets truncated. By pulling out the upstreams and downstream
+                # list, we can at least get the important fields if truncation does occur.
+                context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
                 exc=e,
             )
             return None
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -111,6 +111,8 @@ from datahub.ingestion.source.tableau.tableau_common import (
     tableau_field_to_schema_field,
     workbook_graphql_query,
 )
+from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
+from datahub.ingestion.source.tableau.tableau_validation import check_user_role
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -167,7 +169,7 @@ from datahub.utilities.urns.dataset_urn import DatasetUrn

 try:
     # On earlier versions of the tableauserverclient, the NonXMLResponseError
-    # was thrown when reauthentication was
+    # was thrown when reauthentication was necessary. We'll keep both exceptions
     # around for now, but can remove this in the future.
     from tableauserverclient.server.endpoint.exceptions import (  # type: ignore
         NotSignedInError,
@@ -632,6 +634,33 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
+    logged_in_user: List[UserInfo] = []
+
+
+def report_user_role(report: TableauSourceReport, server: Server) -> None:
+    title: str = "Insufficient Permissions"
+    message: str = "The user must have the `Site Administrator Explorer` role to perform metadata ingestion."
+    try:
+        # TableauSiteSource instance is per site, so each time we need to find-out user detail
+        # the site-role might be different on another site
+        logged_in_user: UserInfo = UserInfo.from_server(server=server)
+
+        if not logged_in_user.is_site_administrator_explorer():
+            report.warning(
+                title=title,
+                message=message,
+                context=f"user-name={logged_in_user.user_name}, role={logged_in_user.site_role}, site_id={logged_in_user.site_id}",
+            )
+
+        report.logged_in_user.append(logged_in_user)
+
+    except Exception as e:
+        report.warning(
+            title=title,
+            message="Failed to verify the user's role. The user must have `Site Administrator Explorer` role.",
+            context=f"{e}",
+            exc=e,
+        )


 @platform_name("Tableau")
@@ -676,6 +705,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         try:
             logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
             self.server = self.config.make_tableau_client(site_content_url)
+            report_user_role(report=self.report, server=self.server)
             # Note that we're not catching ConfigurationError, since we want that to throw.
         except ValueError as e:
             self.report.failure(
@@ -689,9 +719,17 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         test_report = TestConnectionReport()
         try:
             source_config = TableauConfig.parse_obj_allow_extras(config_dict)
-
+
+            server = source_config.make_tableau_client(source_config.site)
+
             test_report.basic_connectivity = CapabilityReport(capable=True)
+
+            test_report.capability_report = check_user_role(
+                logged_in_user=UserInfo.from_server(server=server)
+            )
+
         except Exception as e:
+            logger.warning(f"{e}", exc_info=e)
             test_report.basic_connectivity = CapabilityReport(
                 capable=False, failure_reason=str(e)
             )
@@ -831,6 +869,8 @@ class TableauSiteSource:
         # when emitting custom SQL data sources.
         self.custom_sql_ids_being_used: List[str] = []

+        report_user_role(report=report, server=server)
+
     @property
     def no_env_browse_prefix(self) -> str:
         # Prefix to use with browse path (v1)
@@ -1290,7 +1330,6 @@ class TableauSiteSource:
         page_size = page_size_override or self.config.page_size

         filter_pages = get_filter_pages(query_filter, page_size)
-
         for filter_page in filter_pages:
             has_next_page = 1
             current_cursor: Optional[str] = None
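With this change, `test_connection` reports the site-role capability check alongside basic connectivity. A hedged sketch of invoking it directly; the server address, site, and token values are placeholders, and an unreachable server simply comes back as capable=False rather than raising:

from datahub.ingestion.source.tableau.tableau import TableauSource

report = TableauSource.test_connection(
    {
        "connect_uri": "https://tableau.example.com",
        "site": "my-site",
        "token_name": "token-name",
        "token_value": "token-value",
    }
)
print(report.basic_connectivity)
print(report.capability_report)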
datahub/ingestion/source/tableau/tableau_common.py
CHANGED

@@ -975,15 +975,22 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
     # a few ten thousand, then tableau server responds with empty response
     # causing below error:
     # tableauserverclient.server.endpoint.exceptions.NonXMLResponseError: b''
+
+    # in practice, we only do pagination if len(query_filter.keys()) == 1
+    if len(query_filter.keys()) != 1:
+        return filter_pages
+
+    current_key = (list(query_filter.keys()))[0]
+
     if (
-
-        and query_filter.get(
-        and isinstance(query_filter[
+        current_key in [c.ID_WITH_IN, c.PROJECT_NAME_WITH_IN]
+        and query_filter.get(current_key)
+        and isinstance(query_filter[current_key], list)
     ):
-        ids = query_filter[
+        ids = query_filter[current_key]
         filter_pages = [
             {
-
+                current_key: ids[
                     start : (
                         start + page_size if start + page_size < len(ids) else len(ids)
                     )
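The rewritten `get_filter_pages` generalizes pagination from a hard-coded filter key to whichever single key is present. The core chunking step, as a standalone sketch (the `idWithin` key name is illustrative):

from typing import Dict, List

def chunk_filter(key: str, values: List[str], page_size: int) -> List[Dict[str, List[str]]]:
    # One filter dict per page_size-sized slice of the value list;
    # Python slicing clamps the final page automatically.
    return [
        {key: values[start : start + page_size]}
        for start in range(0, len(values), page_size)
    ]

# chunk_filter("idWithin", ["a", "b", "c"], 2)
# -> [{"idWithin": ["a", "b"]}, {"idWithin": ["c"]}]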
datahub/ingestion/source/tableau/tableau_server_wrapper.py
ADDED

@@ -0,0 +1,33 @@
+from dataclasses import dataclass
+
+from tableauserverclient import Server, UserItem
+
+from datahub.ingestion.source.tableau import tableau_constant as c
+
+
+@dataclass
+class UserInfo:
+    user_name: str
+    site_role: str
+    site_id: str
+
+    def is_site_administrator_explorer(self):
+        return self.site_role == c.SITE_ROLE
+
+    @staticmethod
+    def from_server(server: Server) -> "UserInfo":
+        assert server.user_id, "make the connection with tableau"
+
+        user: UserItem = server.users.get_by_id(server.user_id)
+
+        assert user.site_role, "site_role is not available"  # to silent the lint
+
+        assert user.name, "user name is not available"  # to silent the lint
+
+        assert server.site_id, "site identifier is not available"  # to silent the lint
+
+        return UserInfo(
+            user_name=user.name,
+            site_role=user.site_role,
+            site_id=server.site_id,
+        )
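A hedged usage sketch for the new wrapper: given a signed-in tableauserverclient Server, fetch the logged-in user's site role. The server address, token, and site values are placeholders:

import tableauserverclient as TSC

from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo

auth = TSC.PersonalAccessTokenAuth("token-name", "token-value", site_id="my-site")
server = TSC.Server("https://tableau.example.com", use_server_version=True)

with server.auth.sign_in(auth):
    user = UserInfo.from_server(server=server)
    print(user.user_name, user.site_role, user.is_site_administrator_explorer())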
datahub/ingestion/source/tableau/tableau_validation.py
ADDED

@@ -0,0 +1,48 @@
+import logging
+from typing import Dict, Union
+
+from datahub.ingestion.api.source import CapabilityReport, SourceCapability
+from datahub.ingestion.source.tableau import tableau_constant as c
+from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
+
+logger = logging.getLogger(__name__)
+
+
+def check_user_role(
+    logged_in_user: UserInfo,
+) -> Dict[Union[SourceCapability, str], CapabilityReport]:
+    capability_dict: Dict[Union[SourceCapability, str], CapabilityReport] = {
+        c.SITE_PERMISSION: CapabilityReport(
+            capable=True,
+        )
+    }
+
+    failure_reason: str = (
+        "The user does not have the `Site Administrator Explorer` role."
+    )
+
+    mitigation_message_prefix: str = (
+        "Assign `Site Administrator Explorer` role to the user"
+    )
+    mitigation_message_suffix: str = "Refer to the setup guide: https://datahubproject.io/docs/quick-ingestion-guides/tableau/setup"
+
+    try:
+        # TODO: Add check for `Enable Derived Permissions`
+        if not logged_in_user.is_site_administrator_explorer():
+            capability_dict[c.SITE_PERMISSION] = CapabilityReport(
+                capable=False,
+                failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.",
+                mitigation_message=f"{mitigation_message_prefix} `{logged_in_user.user_name}`. {mitigation_message_suffix}",
+            )
+
+        return capability_dict
+
+    except Exception as e:
+        logger.warning(msg=e, exc_info=e)
+        capability_dict[c.SITE_PERMISSION] = CapabilityReport(
+            capable=False,
+            failure_reason="Failed to verify user role.",
+            mitigation_message=f"{mitigation_message_prefix}. {mitigation_message_suffix}",  # user is unknown
+        )
+
+        return capability_dict
datahub/testing/compare_metadata_json.py
CHANGED

@@ -117,7 +117,7 @@ def diff_metadata_json(
     ignore_paths: Sequence[str] = (),
     ignore_order: bool = True,
 ) -> Union[DeepDiff, MCPDiff]:
-    ignore_paths =
+    ignore_paths = [*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info"]
     try:
         if ignore_order:
             golden_map = get_aspects_by_urn(golden)