acryl-datahub 0.15.0rc21__py3-none-any.whl → 0.15.0rc22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of acryl-datahub has been flagged as a potentially problematic release.

@@ -1,4 +1,4 @@
1
- datahub/__init__.py,sha256=caUPlyD6P05EsMKzRYtlTS611d82sT4szr8_WAu_rJ4,575
1
+ datahub/__init__.py,sha256=T0tNQ0v5Y2QyvLqZg1tU0kxvIjYvmZ8eZdrD_d8Uwe4,575
2
2
  datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
3
3
  datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
4
4
  datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -67,7 +67,7 @@ datahub/cli/docker_cli.py,sha256=QGoWFp8ZZsXOSMbgu0Q4snMmMmtP3epWAN-fYglUNEc,364
67
67
  datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
68
68
  datahub/cli/exists_cli.py,sha256=IsuU86R-g7BJjAl1vULH6d-BWJHAKa4XHLZl5WxGUEM,1233
69
69
  datahub/cli/get_cli.py,sha256=VV80BCXfZ0-C8fr2k43SIuN9DB-fOYP9StWsTHnXwFw,2327
70
- datahub/cli/ingest_cli.py,sha256=miFXBUm9xD8vRvKPwpB-3GXKV1Abf8xtPWyxV6UeenM,18983
70
+ datahub/cli/ingest_cli.py,sha256=nRoZvVpsGPXmEZCvSOBfsZ61Ep1fCqYRVp79RBnHSnI,22393
71
71
  datahub/cli/json_file.py,sha256=nWo-VVthaaW4Do1eUqgrzk0fShb29MjiKXvZVOTq76c,943
72
72
  datahub/cli/lite_cli.py,sha256=UmlMMquce6lHiPaKUBBT0XQtqR9SHEmrGlJyKV9YY60,13030
73
73
  datahub/cli/migrate.py,sha256=p42vixwKzi9OHQnIa0K2FxwGvt-1OxXeuYGJzfu5Sqo,17939
@@ -266,9 +266,9 @@ datahub/ingestion/source/data_lake_common/path_spec.py,sha256=u3u2eMe70V5vur-j8m
266
266
  datahub/ingestion/source/datahub/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
267
267
  datahub/ingestion/source/datahub/config.py,sha256=pOXt0b1PX6D7dtD4RuKwdmr6sQKnXSf6LHxfPUMhP8s,3658
268
268
  datahub/ingestion/source/datahub/datahub_api_reader.py,sha256=hlKADVEPoTFiRGKqRsMF5mL4fSu_IrIW8Nx7LpEzvkM,2134
269
- datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=TLH1KMyvRgiuENr8t0lnBjCxggONsDrxYThRzdNVEuE,8458
269
+ datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=F8JrOjSrmJ2B6m1MWh83A1EYFDcGMla749HUeQWMnL0,9464
270
270
  datahub/ingestion/source/datahub/datahub_kafka_reader.py,sha256=8x9_u5kRjgSmu7c295ZIZjxP6bgoZZbWsKRicuLStRQ,4145
271
- datahub/ingestion/source/datahub/datahub_source.py,sha256=VKUtSRpwLAFatfru_pNy045HSA2z2DPzupQKIiX2uyE,8173
271
+ datahub/ingestion/source/datahub/datahub_source.py,sha256=2jDnsHEzpGhr00qQI9unSUJYD6Cb1McYFKOVbA-Zcm4,8487
272
272
  datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
273
273
  datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
274
274
  datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -398,7 +398,7 @@ datahub/ingestion/source/s3/config.py,sha256=Zs1nrBZKLImteZreIcSMMRLj8vBGgxakNDs
398
398
  datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=k7S9Xcmgr3-CvWrd5NEX-V8JSrcAwkm7vbHPTVZicow,3620
399
399
  datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
400
400
  datahub/ingestion/source/s3/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
401
- datahub/ingestion/source/s3/source.py,sha256=OGc12oNWoXGVeIbSKzYlc7Qy3UeEmQ5vIOm-sG8fJxg,47396
401
+ datahub/ingestion/source/s3/source.py,sha256=8O_vu1J91h7owQlYyK27AZAQHxKsDpNC_jsLNpMed98,47336
402
402
  datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
403
403
  datahub/ingestion/source/sac/sac.py,sha256=zPSO9ukuyhvNaaVzeAYpA-_sFma_XMcCQMPaGvDWuTk,30226
404
404
  datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -486,9 +486,11 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
486
486
  datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
487
487
  datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
488
488
  datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
489
- datahub/ingestion/source/tableau/tableau.py,sha256=2M0d4IYn0kcMFlQ2yAvPRnXKZcj_xcqvEJik7QDnebI,136605
490
- datahub/ingestion/source/tableau/tableau_common.py,sha256=WugmFZvLgrHjvhUVBBZGRXiBJcsh2qcZK2TnWo5UQEA,26007
491
- datahub/ingestion/source/tableau/tableau_constant.py,sha256=nWElhtDo5kj5mWivZFmtVF_4Ugw0-EatBYWyDVzu5hE,2501
489
+ datahub/ingestion/source/tableau/tableau.py,sha256=P_DUuUvXk5u2ihA0JghtRkYc_KI_yQR2ZiQVe9IUvsU,138197
490
+ datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
491
+ datahub/ingestion/source/tableau/tableau_constant.py,sha256=jVQMgLXND5aPL6XLETKp81BehRkvyLTU_Vhhe_1NOkI,2576
492
+ datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=PEGfcoUcBdsnOa5EzCqy1IiuQ3OZ9fZVEMzDqhhHOto,922
493
+ datahub/ingestion/source/tableau/tableau_validation.py,sha256=l0DuXUuxJwEXMzo61xLx-KLc5u6tiz2n0e9EepJdWEM,1808
492
494
  datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
493
495
  datahub/ingestion/source/unity/analyze_profiler.py,sha256=2pqkFY30CfN4aHgFZZntjeG0hNhBytZJvXC13VfTc1I,4689
494
496
  datahub/ingestion/source/unity/config.py,sha256=m4-n7mYz4Ct4L1QdfJFklwHyj8boKCbV7Sb3Ou6AT3Q,14756
@@ -882,7 +884,7 @@ datahub/testing/__init__.py,sha256=TywIuzGQvzJsNhI_PGD1RFk11M3RtGl9jIMtAVVHIkg,2
882
884
  datahub/testing/check_imports.py,sha256=EKuJmgUA46uOrlaOy0fCvPB7j9POkpJ0ExhO_pT3YAk,1356
883
885
  datahub/testing/check_sql_parser_result.py,sha256=f7U7IUSbfV4VACdNI857wPZ9tAZ9j6mXiXmcJNT_RzM,2671
884
886
  datahub/testing/check_str_enum.py,sha256=yqk0XXHOGteN-IGqCp5JHy0Kca13BnI09ZqKc4Nwl3E,1187
885
- datahub/testing/compare_metadata_json.py,sha256=EzIPHtRL00a1PSdaA82LU0oRo85GqjF7_jjWG_NwfW8,5274
887
+ datahub/testing/compare_metadata_json.py,sha256=pVJB2qLoKzEJLBXqFT-qGrxpA1y76y-mIbvJf0NnAD0,5274
886
888
  datahub/testing/docker_utils.py,sha256=g169iy_jNR_mg0p8X31cChZqjOryutAIHUYLq3xqueY,2415
887
889
  datahub/testing/doctest.py,sha256=1_8WEhHZ2eRQtw8vsXKzr9L5zzvs0Tcr6q4mnkyyvtw,295
888
890
  datahub/testing/mcp_diff.py,sha256=_sBFhmclYXJGQ_JYDrvKWXNGXt9ACvqeQvFaZrRHa8Q,10729
@@ -900,7 +902,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
900
902
  datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
901
903
  datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
902
904
  datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
903
- datahub/utilities/file_backed_collections.py,sha256=tS6hN0kxMaaTb8rB-vDqAd973mTKshERSC55JIe_3Cw,20557
905
+ datahub/utilities/file_backed_collections.py,sha256=I2GxSYtVzfo38pQpv2FyoBeWISiKD4zUi0t34jPCNrU,21957
904
906
  datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
905
907
  datahub/utilities/hive_schema_to_avro.py,sha256=2-7NI9haCAYbyUmHTb-QPxjn4WbmnDDKAIGmHJ11-1E,11622
906
908
  datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
@@ -974,8 +976,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
974
976
  datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
975
977
  datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
976
978
  datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
977
- acryl_datahub-0.15.0rc21.dist-info/METADATA,sha256=e3Tw7Cix7Z1uR8zyUtppjUv0ztJa2Kga0yl7nwPMbF8,173559
978
- acryl_datahub-0.15.0rc21.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
979
- acryl_datahub-0.15.0rc21.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
980
- acryl_datahub-0.15.0rc21.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
981
- acryl_datahub-0.15.0rc21.dist-info/RECORD,,
979
+ acryl_datahub-0.15.0rc22.dist-info/METADATA,sha256=48jbXm5fKitlO7rhjtNA1FcJT9Y7ypQ25EtatHbSeqY,173559
980
+ acryl_datahub-0.15.0rc22.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
981
+ acryl_datahub-0.15.0rc22.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
982
+ acryl_datahub-0.15.0rc22.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
983
+ acryl_datahub-0.15.0rc22.dist-info/RECORD,,
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
3
3
 
4
4
  # Published at https://pypi.org/project/acryl-datahub/.
5
5
  __package_name__ = "acryl-datahub"
6
- __version__ = "0.15.0rc21"
6
+ __version__ = "0.15.0rc22"
7
7
 
8
8
 
9
9
  def is_dev_mode() -> bool:
datahub/cli/ingest_cli.py CHANGED
@@ -27,6 +27,7 @@ from datahub.utilities.perf_timer import PerfTimer
27
27
 
28
28
  logger = logging.getLogger(__name__)
29
29
 
30
+ INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"]
30
31
  RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"]
31
32
  RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"]
32
33
 
@@ -437,6 +438,115 @@ def mcps(path: str) -> None:
437
438
  sys.exit(ret)
438
439
 
439
440
 
441
+ @ingest.command()
442
+ @click.argument("page_offset", type=int, default=0)
443
+ @click.argument("page_size", type=int, default=100)
444
+ @click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.")
445
+ @click.option(
446
+ "--source", type=str, default=None, help="Filter by ingestion source name."
447
+ )
448
+ @upgrade.check_upgrade
449
+ @telemetry.with_telemetry()
450
+ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
451
+ """List ingestion source runs with their details, optionally filtered by URN or source."""
452
+
453
+ query = """
454
+ query listIngestionRuns($input: ListIngestionSourcesInput!) {
455
+ listIngestionSources(input: $input) {
456
+ ingestionSources {
457
+ urn
458
+ name
459
+ executions {
460
+ executionRequests {
461
+ id
462
+ result {
463
+ startTimeMs
464
+ status
465
+ }
466
+ }
467
+ }
468
+ }
469
+ }
470
+ }
471
+ """
472
+
473
+ # filter by urn and/or source using CONTAINS
474
+ filters = []
475
+ if urn:
476
+ filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"})
477
+ if source:
478
+ filters.append({"field": "name", "values": [source], "condition": "CONTAIN"})
479
+
480
+ variables = {
481
+ "input": {
482
+ "start": page_offset,
483
+ "count": page_size,
484
+ "filters": filters,
485
+ }
486
+ }
487
+
488
+ client = get_default_graph()
489
+ session = client._session
490
+ gms_host = client.config.server
491
+
492
+ url = f"{gms_host}/api/graphql"
493
+ try:
494
+ response = session.post(url, json={"query": query, "variables": variables})
495
+ response.raise_for_status()
496
+ except Exception as e:
497
+ click.echo(f"Error fetching data: {str(e)}")
498
+ return
499
+
500
+ try:
501
+ data = response.json()
502
+ except ValueError:
503
+ click.echo("Failed to parse JSON response from server.")
504
+ return
505
+
506
+ if not data:
507
+ click.echo("No response received from the server.")
508
+ return
509
+
510
+ # when urn or source filter does not match, exit gracefully
511
+ if (
512
+ not isinstance(data.get("data"), dict)
513
+ or "listIngestionSources" not in data["data"]
514
+ ):
515
+ click.echo("No matching ingestion sources found. Please check your filters.")
516
+ return
517
+
518
+ ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
519
+ if not ingestion_sources:
520
+ click.echo("No ingestion sources or executions found.")
521
+ return
522
+
523
+ rows = []
524
+ for ingestion_source in ingestion_sources:
525
+ urn = ingestion_source.get("urn", "N/A")
526
+ name = ingestion_source.get("name", "N/A")
527
+
528
+ executions = ingestion_source.get("executions", {}).get("executionRequests", [])
529
+ for execution in executions:
530
+ execution_id = execution.get("id", "N/A")
531
+ start_time = execution.get("result", {}).get("startTimeMs", "N/A")
532
+ start_time = (
533
+ datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
534
+ if start_time != "N/A"
535
+ else "N/A"
536
+ )
537
+ status = execution.get("result", {}).get("status", "N/A")
538
+
539
+ rows.append([execution_id, name, start_time, status, urn])
540
+
541
+ click.echo(
542
+ tabulate(
543
+ rows,
544
+ headers=INGEST_SRC_TABLE_COLUMNS,
545
+ tablefmt="grid",
546
+ )
547
+ )
548
+
549
+
440
550
  @ingest.command()
441
551
  @click.argument("page_offset", type=int, default=0)
442
552
  @click.argument("page_size", type=int, default=100)
datahub/ingestion/source/datahub/datahub_database_reader.py CHANGED
@@ -147,6 +147,47 @@ class DataHubDatabaseReader:
147
147
  version
148
148
  """
149
149
 
150
+ def execute_server_cursor(
151
+ self, query: str, params: Dict[str, Any]
152
+ ) -> Iterable[Dict[str, Any]]:
153
+ with self.engine.connect() as conn:
154
+ if self.engine.dialect.name == "postgresql":
155
+ with conn.begin(): # Transaction required for PostgreSQL server-side cursor
156
+ conn = conn.execution_options(
157
+ stream_results=True,
158
+ yield_per=self.config.database_query_batch_size,
159
+ )
160
+ result = conn.execute(query, params)
161
+ for row in result:
162
+ yield dict(row)
163
+ elif self.engine.dialect.name == "mysql": # MySQL
164
+ import MySQLdb
165
+
166
+ with contextlib.closing(
167
+ conn.connection.cursor(MySQLdb.cursors.SSCursor)
168
+ ) as cursor:
169
+ logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
170
+ cursor.execute(query, params)
171
+
172
+ columns = [desc[0] for desc in cursor.description]
173
+ while True:
174
+ rows = cursor.fetchmany(self.config.database_query_batch_size)
175
+ if not rows:
176
+ break # Use break instead of return in generator
177
+ for row in rows:
178
+ yield dict(zip(columns, row))
179
+ else:
180
+ raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
181
+
182
+ def _get_rows(
183
+ self, from_createdon: datetime, stop_time: datetime
184
+ ) -> Iterable[Dict[str, Any]]:
185
+ params = {
186
+ "exclude_aspects": list(self.config.exclude_aspects),
187
+ "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
188
+ }
189
+ yield from self.execute_server_cursor(self.query, params)
190
+
150
191
  def get_aspects(
151
192
  self, from_createdon: datetime, stop_time: datetime
152
193
  ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
@@ -159,27 +200,6 @@ class DataHubDatabaseReader:
159
200
  if mcp:
160
201
  yield mcp, row["createdon"]
161
202
 
162
- def _get_rows(
163
- self, from_createdon: datetime, stop_time: datetime
164
- ) -> Iterable[Dict[str, Any]]:
165
- with self.engine.connect() as conn:
166
- with contextlib.closing(conn.connection.cursor()) as cursor:
167
- cursor.execute(
168
- self.query,
169
- {
170
- "exclude_aspects": list(self.config.exclude_aspects),
171
- "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
172
- },
173
- )
174
-
175
- columns = [desc[0] for desc in cursor.description]
176
- while True:
177
- rows = cursor.fetchmany(self.config.database_query_batch_size)
178
- if not rows:
179
- return
180
- for row in rows:
181
- yield dict(zip(columns, row))
182
-
183
203
  def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
184
204
  """
185
205
  Fetches all soft-deleted entities from the database.
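The PostgreSQL branch above relies on SQLAlchemy's server-side cursor support. A standalone sketch of that streaming pattern, with a placeholder engine URL, query, and batch size:

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/datahub")  # placeholder URL
query = text("SELECT urn, aspect, createdon FROM metadata_aspect_v2 WHERE createdon >= :since")

with engine.connect() as conn:
    # PostgreSQL requires an open transaction for a server-side cursor.
    with conn.begin():
        streaming = conn.execution_options(stream_results=True, yield_per=2000)
        for row in streaming.execute(query, {"since": "2024-01-01"}):
            print(dict(row._mapping))  # rows arrive in batches instead of all at once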
datahub/ingestion/source/datahub/datahub_source.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from datetime import datetime, timezone
2
+ from datetime import datetime, timedelta, timezone
3
3
  from functools import partial
4
4
  from typing import Dict, Iterable, List, Optional
5
5
 
@@ -26,6 +26,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
26
26
  StatefulIngestionSourceBase,
27
27
  )
28
28
  from datahub.metadata.schema_classes import ChangeTypeClass
29
+ from datahub.utilities.progress_timer import ProgressTimer
29
30
 
30
31
  logger = logging.getLogger(__name__)
31
32
 
@@ -105,11 +106,17 @@ class DataHubSource(StatefulIngestionSourceBase):
105
106
  self, from_createdon: datetime, reader: DataHubDatabaseReader
106
107
  ) -> Iterable[MetadataWorkUnit]:
107
108
  logger.info(f"Fetching database aspects starting from {from_createdon}")
109
+ progress = ProgressTimer(report_every=timedelta(seconds=60))
108
110
  mcps = reader.get_aspects(from_createdon, self.report.stop_time)
109
111
  for i, (mcp, createdon) in enumerate(mcps):
110
112
  if not self.urn_pattern.allowed(str(mcp.entityUrn)):
111
113
  continue
112
114
 
115
+ if progress.should_report():
116
+ logger.info(
117
+ f"Ingested {i} database aspects so far, currently at {createdon}"
118
+ )
119
+
113
120
  yield mcp.as_workunit()
114
121
  self.report.num_database_aspects_ingested += 1
115
122
 
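The periodic logging above uses `datahub.utilities.progress_timer.ProgressTimer`. A minimal stand-in with the same two-call surface (interval in the constructor, `should_report()` in the loop), written here only to illustrate the assumed behavior:

from datetime import datetime, timedelta

class SimpleProgressTimer:
    """Illustrative stand-in; not the real ProgressTimer implementation."""

    def __init__(self, report_every: timedelta) -> None:
        self._report_every = report_every
        self._last_report = datetime.now()

    def should_report(self) -> bool:
        now = datetime.now()
        if now - self._last_report >= self._report_every:
            self._last_report = now
            return True
        return False

progress = SimpleProgressTimer(report_every=timedelta(seconds=60))
for i in range(1_000_000):
    if progress.should_report():
        print(f"processed {i} records so far")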
datahub/ingestion/source/s3/source.py CHANGED
@@ -9,6 +9,7 @@ from datetime import datetime
9
9
  from itertools import groupby
10
10
  from pathlib import PurePath
11
11
  from typing import Any, Dict, Iterable, List, Optional, Tuple
12
+ from urllib.parse import urlparse
12
13
 
13
14
  import smart_open.compression as so_compression
14
15
  from more_itertools import peekable
@@ -993,9 +994,7 @@ class S3Source(StatefulIngestionSourceBase):
993
994
  folders = []
994
995
  for dir in dirs_to_process:
995
996
  logger.info(f"Getting files from folder: {dir}")
996
- prefix_to_process = dir.rstrip("\\").lstrip(
997
- self.create_s3_path(bucket_name, "/")
998
- )
997
+ prefix_to_process = urlparse(dir).path.lstrip("/")
999
998
 
1000
999
  folders.extend(
1001
1000
  self.get_folder_info(
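The one-line replacement above matters because `str.lstrip` strips a set of characters rather than a prefix, so the old code could eat leading characters of the object key. A quick check of both behaviors:

from urllib.parse import urlparse

s3_dir = "s3://my-bucket/my-data/2024/"
print(urlparse(s3_dir).path.lstrip("/"))  # "my-data/2024/"  (key extracted cleanly)
print(s3_dir.lstrip("s3://my-bucket/"))   # "data/2024/"     (leading "my-" stripped as characters)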
datahub/ingestion/source/tableau/tableau.py CHANGED
@@ -111,6 +111,8 @@ from datahub.ingestion.source.tableau.tableau_common import (
111
111
  tableau_field_to_schema_field,
112
112
  workbook_graphql_query,
113
113
  )
114
+ from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
115
+ from datahub.ingestion.source.tableau.tableau_validation import check_user_role
114
116
  from datahub.metadata.com.linkedin.pegasus2avro.common import (
115
117
  AuditStamp,
116
118
  ChangeAuditStamps,
@@ -167,7 +169,7 @@ from datahub.utilities.urns.dataset_urn import DatasetUrn
167
169
 
168
170
  try:
169
171
  # On earlier versions of the tableauserverclient, the NonXMLResponseError
170
- # was thrown when reauthentication was needed. We'll keep both exceptions
172
+ # was thrown when reauthentication was necessary. We'll keep both exceptions
171
173
  # around for now, but can remove this in the future.
172
174
  from tableauserverclient.server.endpoint.exceptions import ( # type: ignore
173
175
  NotSignedInError,
@@ -632,6 +634,33 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
632
634
  num_upstream_table_lineage_failed_parse_sql: int = 0
633
635
  num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
634
636
  num_hidden_assets_skipped: int = 0
637
+ logged_in_user: List[UserInfo] = []
638
+
639
+
640
+ def report_user_role(report: TableauSourceReport, server: Server) -> None:
641
+ title: str = "Insufficient Permissions"
642
+ message: str = "The user must have the `Site Administrator Explorer` role to perform metadata ingestion."
643
+ try:
644
+ # TableauSiteSource instance is per site, so each time we need to find-out user detail
645
+ # the site-role might be different on another site
646
+ logged_in_user: UserInfo = UserInfo.from_server(server=server)
647
+
648
+ if not logged_in_user.is_site_administrator_explorer():
649
+ report.warning(
650
+ title=title,
651
+ message=message,
652
+ context=f"user-name={logged_in_user.user_name}, role={logged_in_user.site_role}, site_id={logged_in_user.site_id}",
653
+ )
654
+
655
+ report.logged_in_user.append(logged_in_user)
656
+
657
+ except Exception as e:
658
+ report.warning(
659
+ title=title,
660
+ message="Failed to verify the user's role. The user must have `Site Administrator Explorer` role.",
661
+ context=f"{e}",
662
+ exc=e,
663
+ )
635
664
 
636
665
 
637
666
  @platform_name("Tableau")
@@ -676,6 +705,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
676
705
  try:
677
706
  logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
678
707
  self.server = self.config.make_tableau_client(site_content_url)
708
+ report_user_role(report=self.report, server=self.server)
679
709
  # Note that we're not catching ConfigurationError, since we want that to throw.
680
710
  except ValueError as e:
681
711
  self.report.failure(
@@ -689,9 +719,17 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
689
719
  test_report = TestConnectionReport()
690
720
  try:
691
721
  source_config = TableauConfig.parse_obj_allow_extras(config_dict)
692
- source_config.make_tableau_client(source_config.site)
722
+
723
+ server = source_config.make_tableau_client(source_config.site)
724
+
693
725
  test_report.basic_connectivity = CapabilityReport(capable=True)
726
+
727
+ test_report.capability_report = check_user_role(
728
+ logged_in_user=UserInfo.from_server(server=server)
729
+ )
730
+
694
731
  except Exception as e:
732
+ logger.warning(f"{e}", exc_info=e)
695
733
  test_report.basic_connectivity = CapabilityReport(
696
734
  capable=False, failure_reason=str(e)
697
735
  )
@@ -831,6 +869,8 @@ class TableauSiteSource:
831
869
  # when emitting custom SQL data sources.
832
870
  self.custom_sql_ids_being_used: List[str] = []
833
871
 
872
+ report_user_role(report=report, server=server)
873
+
834
874
  @property
835
875
  def no_env_browse_prefix(self) -> str:
836
876
  # Prefix to use with browse path (v1)
@@ -1290,7 +1330,6 @@ class TableauSiteSource:
1290
1330
  page_size = page_size_override or self.config.page_size
1291
1331
 
1292
1332
  filter_pages = get_filter_pages(query_filter, page_size)
1293
-
1294
1333
  for filter_page in filter_pages:
1295
1334
  has_next_page = 1
1296
1335
  current_cursor: Optional[str] = None
datahub/ingestion/source/tableau/tableau_common.py CHANGED
@@ -975,15 +975,22 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
975
975
  # a few ten thousand, then tableau server responds with empty response
976
976
  # causing below error:
977
977
  # tableauserverclient.server.endpoint.exceptions.NonXMLResponseError: b''
978
+
979
+ # in practice, we only do pagination if len(query_filter.keys()) == 1
980
+ if len(query_filter.keys()) != 1:
981
+ return filter_pages
982
+
983
+ current_key = (list(query_filter.keys()))[0]
984
+
978
985
  if (
979
- len(query_filter.keys()) == 1
980
- and query_filter.get(c.ID_WITH_IN)
981
- and isinstance(query_filter[c.ID_WITH_IN], list)
986
+ current_key in [c.ID_WITH_IN, c.PROJECT_NAME_WITH_IN]
987
+ and query_filter.get(current_key)
988
+ and isinstance(query_filter[current_key], list)
982
989
  ):
983
- ids = query_filter[c.ID_WITH_IN]
990
+ ids = query_filter[current_key]
984
991
  filter_pages = [
985
992
  {
986
- c.ID_WITH_IN: ids[
993
+ current_key: ids[
987
994
  start : (
988
995
  start + page_size if start + page_size < len(ids) else len(ids)
989
996
  )
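In effect, the updated `get_filter_pages` only paginates single-key filters whose value is a list of IDs or project names, slicing that list into page-sized chunks. A self-contained sketch of that slicing (the literal key strings are assumptions; the real code uses `c.ID_WITH_IN` and `c.PROJECT_NAME_WITH_IN`):

from typing import List

def split_filter_into_pages(query_filter: dict, page_size: int) -> List[dict]:
    # Pagination only applies to a single-key filter with a list value.
    if len(query_filter) != 1:
        return [query_filter]
    key = next(iter(query_filter))
    values = query_filter[key]
    if key not in ("idWithin", "projectNameWithin") or not isinstance(values, list):
        return [query_filter]
    return [
        {key: values[start : start + page_size]}
        for start in range(0, len(values), page_size)
    ]

print(split_filter_into_pages({"idWithin": ["a", "b", "c", "d", "e"]}, page_size=2))
# [{'idWithin': ['a', 'b']}, {'idWithin': ['c', 'd']}, {'idWithin': ['e']}]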
datahub/ingestion/source/tableau/tableau_constant.py CHANGED
@@ -81,3 +81,5 @@ EMBEDDED_DATA_SOURCES_CONNECTION = "embeddedDatasourcesConnection"
81
81
  PROJECT = "Project"
82
82
  SITE = "Site"
83
83
  IS_UNSUPPORTED_CUSTOM_SQL = "isUnsupportedCustomSql"
84
+ SITE_PERMISSION = "sitePermission"
85
+ SITE_ROLE = "SiteAdministratorExplorer"
datahub/ingestion/source/tableau/tableau_server_wrapper.py ADDED
@@ -0,0 +1,33 @@
1
+ from dataclasses import dataclass
2
+
3
+ from tableauserverclient import Server, UserItem
4
+
5
+ from datahub.ingestion.source.tableau import tableau_constant as c
6
+
7
+
8
+ @dataclass
9
+ class UserInfo:
10
+ user_name: str
11
+ site_role: str
12
+ site_id: str
13
+
14
+ def is_site_administrator_explorer(self):
15
+ return self.site_role == c.SITE_ROLE
16
+
17
+ @staticmethod
18
+ def from_server(server: Server) -> "UserInfo":
19
+ assert server.user_id, "make the connection with tableau"
20
+
21
+ user: UserItem = server.users.get_by_id(server.user_id)
22
+
23
+ assert user.site_role, "site_role is not available" # to silent the lint
24
+
25
+ assert user.name, "user name is not available" # to silent the lint
26
+
27
+ assert server.site_id, "site identifier is not available" # to silent the lint
28
+
29
+ return UserInfo(
30
+ user_name=user.name,
31
+ site_role=user.site_role,
32
+ site_id=server.site_id,
33
+ )
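A hypothetical use of the new wrapper against an authenticated tableauserverclient connection (server address, token, and site values are placeholders):

import tableauserverclient as TSC

from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo

auth = TSC.PersonalAccessTokenAuth("token-name", "token-value", site_id="my-site")
server = TSC.Server("https://tableau.example.com", use_server_version=True)

with server.auth.sign_in(auth):
    user = UserInfo.from_server(server=server)
    print(user.user_name, user.site_role, user.site_id)
    print("has required role:", user.is_site_administrator_explorer())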
datahub/ingestion/source/tableau/tableau_validation.py ADDED
@@ -0,0 +1,48 @@
1
+ import logging
2
+ from typing import Dict, Union
3
+
4
+ from datahub.ingestion.api.source import CapabilityReport, SourceCapability
5
+ from datahub.ingestion.source.tableau import tableau_constant as c
6
+ from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ def check_user_role(
12
+ logged_in_user: UserInfo,
13
+ ) -> Dict[Union[SourceCapability, str], CapabilityReport]:
14
+ capability_dict: Dict[Union[SourceCapability, str], CapabilityReport] = {
15
+ c.SITE_PERMISSION: CapabilityReport(
16
+ capable=True,
17
+ )
18
+ }
19
+
20
+ failure_reason: str = (
21
+ "The user does not have the `Site Administrator Explorer` role."
22
+ )
23
+
24
+ mitigation_message_prefix: str = (
25
+ "Assign `Site Administrator Explorer` role to the user"
26
+ )
27
+ mitigation_message_suffix: str = "Refer to the setup guide: https://datahubproject.io/docs/quick-ingestion-guides/tableau/setup"
28
+
29
+ try:
30
+ # TODO: Add check for `Enable Derived Permissions`
31
+ if not logged_in_user.is_site_administrator_explorer():
32
+ capability_dict[c.SITE_PERMISSION] = CapabilityReport(
33
+ capable=False,
34
+ failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.",
35
+ mitigation_message=f"{mitigation_message_prefix} `{logged_in_user.user_name}`. {mitigation_message_suffix}",
36
+ )
37
+
38
+ return capability_dict
39
+
40
+ except Exception as e:
41
+ logger.warning(msg=e, exc_info=e)
42
+ capability_dict[c.SITE_PERMISSION] = CapabilityReport(
43
+ capable=False,
44
+ failure_reason="Failed to verify user role.",
45
+ mitigation_message=f"{mitigation_message_prefix}. {mitigation_message_suffix}", # user is unknown
46
+ )
47
+
48
+ return capability_dict
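A small illustration of the capability report `check_user_role` produces for a user who lacks the required role (the UserInfo values are made up):

from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
from datahub.ingestion.source.tableau.tableau_validation import check_user_role

viewer = UserInfo(user_name="alice", site_role="Viewer", site_id="site-123")
report = check_user_role(logged_in_user=viewer)
for capability, result in report.items():
    # Expect a single "sitePermission" entry with capable=False and a mitigation message.
    print(capability, result.capable, result.failure_reason)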
datahub/testing/compare_metadata_json.py CHANGED
@@ -117,7 +117,7 @@ def diff_metadata_json(
117
117
  ignore_paths: Sequence[str] = (),
118
118
  ignore_order: bool = True,
119
119
  ) -> Union[DeepDiff, MCPDiff]:
120
- ignore_paths = (*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info")
120
+ ignore_paths = [*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info"]
121
121
  try:
122
122
  if ignore_order:
123
123
  golden_map = get_aspects_by_urn(golden)
datahub/utilities/file_backed_collections.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import collections
2
2
  import gzip
3
3
  import logging
4
+ import os
4
5
  import pathlib
5
6
  import pickle
6
7
  import shutil
@@ -33,6 +34,14 @@ from datahub.ingestion.api.closeable import Closeable
33
34
 
34
35
  logger: logging.Logger = logging.getLogger(__name__)
35
36
 
37
+ OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR = (
38
+ os.environ.get("OVERRIDE_SQLITE_VERSION_REQ") or ""
39
+ )
40
+ OVERRIDE_SQLITE_VERSION_REQUIREMENT = (
41
+ OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR
42
+ and OVERRIDE_SQLITE_VERSION_REQUIREMENT_STR.lower() != "false"
43
+ )
44
+
36
45
  _DEFAULT_FILE_NAME = "sqlite.db"
37
46
  _DEFAULT_TABLE_NAME = "data"
38
47
 
@@ -212,6 +221,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
212
221
  _active_object_cache: OrderedDict[str, Tuple[_VT, bool]] = field(
213
222
  init=False, repr=False
214
223
  )
224
+ _use_sqlite_on_conflict: bool = field(repr=False, default=True)
215
225
 
216
226
  def __post_init__(self) -> None:
217
227
  assert (
@@ -232,7 +242,10 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
232
242
  # We use the ON CONFLICT clause to implement UPSERTs with sqlite.
233
243
  # This was added in 3.24.0 from 2018-06-04.
234
244
  # See https://www.sqlite.org/lang_conflict.html
235
- raise RuntimeError("SQLite version 3.24.0 or later is required")
245
+ if OVERRIDE_SQLITE_VERSION_REQUIREMENT:
246
+ self.use_sqlite_on_conflict = False
247
+ else:
248
+ raise RuntimeError("SQLite version 3.24.0 or later is required")
236
249
 
237
250
  # We keep a small cache in memory to avoid having to serialize/deserialize
238
251
  # data from the database too often. We use an OrderedDict to build
@@ -295,7 +308,7 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
295
308
  values.append(column_serializer(value))
296
309
  items_to_write.append(tuple(values))
297
310
 
298
- if items_to_write:
311
+ if items_to_write and self._use_sqlite_on_conflict:
299
312
  # Tricky: By using a INSERT INTO ... ON CONFLICT (key) structure, we can
300
313
  # ensure that the rowid remains the same if a value is updated but is
301
314
  # autoincremented when rows are inserted.
@@ -312,6 +325,26 @@ class FileBackedDict(MutableMapping[str, _VT], Closeable, Generic[_VT]):
312
325
  """,
313
326
  items_to_write,
314
327
  )
328
+ else:
329
+ for item in items_to_write:
330
+ try:
331
+ self._conn.execute(
332
+ f"""INSERT INTO {self.tablename} (
333
+ key,
334
+ value
335
+ {''.join(f', {column_name}' for column_name in self.extra_columns.keys())}
336
+ )
337
+ VALUES ({', '.join(['?'] *(2 + len(self.extra_columns)))})""",
338
+ item,
339
+ )
340
+ except sqlite3.IntegrityError:
341
+ self._conn.execute(
342
+ f"""UPDATE {self.tablename} SET
343
+ value = ?
344
+ {''.join(f', {column_name} = ?' for column_name in self.extra_columns.keys())}
345
+ WHERE key = ?""",
346
+ (*item[1:], item[0]),
347
+ )
315
348
 
316
349
  def flush(self) -> None:
317
350
  self._prune_cache(len(self._active_object_cache))
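For reference, a self-contained sqlite3 sketch of the two write paths above: the ON CONFLICT upsert (requires SQLite 3.24.0+, released 2018-06-04) and the INSERT-then-UPDATE fallback enabled by the new override (table and column names are illustrative):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE data (key TEXT PRIMARY KEY, value TEXT)")

def upsert_on_conflict(key: str, value: str) -> None:
    # Fast path: single statement, needs SQLite >= 3.24.0.
    conn.execute(
        "INSERT INTO data (key, value) VALUES (?, ?) "
        "ON CONFLICT (key) DO UPDATE SET value = excluded.value",
        (key, value),
    )

def upsert_fallback(key: str, value: str) -> None:
    # Fallback path: plain INSERT, then UPDATE when the key already exists.
    try:
        conn.execute("INSERT INTO data (key, value) VALUES (?, ?)", (key, value))
    except sqlite3.IntegrityError:
        conn.execute("UPDATE data SET value = ? WHERE key = ?", (value, key))

upsert_on_conflict("k", "v1")
upsert_fallback("k", "v2")
print(conn.execute("SELECT value FROM data WHERE key = 'k'").fetchone())  # ('v2',)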