acryl-datahub 0.15.0rc21__py3-none-any.whl → 0.15.0rc23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/METADATA +2411 -2411
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/RECORD +19 -17
- datahub/__init__.py +1 -1
- datahub/cli/ingest_cli.py +110 -0
- datahub/configuration/git.py +7 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +5 -1
- datahub/ingestion/source/tableau/tableau.py +42 -3
- datahub/ingestion/source/tableau/tableau_common.py +12 -5
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/utilities/file_backed_collections.py +35 -2
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0rc21.dist-info → acryl_datahub-0.15.0rc23.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-datahub/__init__.py,sha256=
+datahub/__init__.py,sha256=eOmo10Qg3UHdXM-mhXsProWUviox9Ng9kfUMS-B8xpo,575
 datahub/__main__.py,sha256=pegIvQ9hzK7IhqVeUi1MeADSZ2QlP-D3K0OQdEg55RU,106
 datahub/entrypoints.py,sha256=3-qSfXAx3Z0FEkBV5tlO8fQr4xk4ySeDRMVTpS5Xd6A,7793
 datahub/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -67,7 +67,7 @@ datahub/cli/docker_cli.py,sha256=QGoWFp8ZZsXOSMbgu0Q4snMmMmtP3epWAN-fYglUNEc,364
 datahub/cli/env_utils.py,sha256=RQzjg4JE29hjPt4v7p-RuqoOr99w8E3DBHWiN2Sm7T4,252
 datahub/cli/exists_cli.py,sha256=IsuU86R-g7BJjAl1vULH6d-BWJHAKa4XHLZl5WxGUEM,1233
 datahub/cli/get_cli.py,sha256=VV80BCXfZ0-C8fr2k43SIuN9DB-fOYP9StWsTHnXwFw,2327
-datahub/cli/ingest_cli.py,sha256=
+datahub/cli/ingest_cli.py,sha256=nRoZvVpsGPXmEZCvSOBfsZ61Ep1fCqYRVp79RBnHSnI,22393
 datahub/cli/json_file.py,sha256=nWo-VVthaaW4Do1eUqgrzk0fShb29MjiKXvZVOTq76c,943
 datahub/cli/lite_cli.py,sha256=UmlMMquce6lHiPaKUBBT0XQtqR9SHEmrGlJyKV9YY60,13030
 datahub/cli/migrate.py,sha256=p42vixwKzi9OHQnIa0K2FxwGvt-1OxXeuYGJzfu5Sqo,17939
@@ -93,7 +93,7 @@ datahub/configuration/common.py,sha256=Ngj2-HKPEhCMbcx3phUqyoOHayhqWNt1t0e2hO3GQ
 datahub/configuration/config_loader.py,sha256=4V8rrbKvCbfEys2Tlw2uZXb3yC9Hpoubn2O8GXhGe3A,5785
 datahub/configuration/connection_resolver.py,sha256=n4-6MwMiOEDgTouxO0SMjTILKVhJPo6-naE6FuR5qMs,1516
 datahub/configuration/datetimes.py,sha256=nayNc0mmlVKH6oVv9ud6C1dDUiZPGabW-YZxvrkosPg,2870
-datahub/configuration/git.py,sha256=
+datahub/configuration/git.py,sha256=q9iac6cc6oZ3RVSPTyuR2VMsmt2wr-uVaCLWohdKVV0,6461
 datahub/configuration/import_resolver.py,sha256=b4Ie9L7knN1LALEVMxTcNFSklDD6CVE-4Ipy4ZYhNYA,369
 datahub/configuration/json_loader.py,sha256=vIDnjwXWi9yHDO8KW64EupOzOb_sspehGCD7xGHzg84,302
 datahub/configuration/kafka.py,sha256=MlIwpd5FFyOyjdDXW_X9JTLNk7f988sPMgevkcZYVgI,2579
@@ -266,9 +266,9 @@ datahub/ingestion/source/data_lake_common/path_spec.py,sha256=u3u2eMe70V5vur-j8m
 datahub/ingestion/source/datahub/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/datahub/config.py,sha256=pOXt0b1PX6D7dtD4RuKwdmr6sQKnXSf6LHxfPUMhP8s,3658
 datahub/ingestion/source/datahub/datahub_api_reader.py,sha256=hlKADVEPoTFiRGKqRsMF5mL4fSu_IrIW8Nx7LpEzvkM,2134
-datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=
+datahub/ingestion/source/datahub/datahub_database_reader.py,sha256=F8JrOjSrmJ2B6m1MWh83A1EYFDcGMla749HUeQWMnL0,9464
 datahub/ingestion/source/datahub/datahub_kafka_reader.py,sha256=8x9_u5kRjgSmu7c295ZIZjxP6bgoZZbWsKRicuLStRQ,4145
-datahub/ingestion/source/datahub/datahub_source.py,sha256=
+datahub/ingestion/source/datahub/datahub_source.py,sha256=2jDnsHEzpGhr00qQI9unSUJYD6Cb1McYFKOVbA-Zcm4,8487
 datahub/ingestion/source/datahub/report.py,sha256=VHBfCbwFRzdLdB7hQG9ST4EiZxl_vBCU0XxGcZR6Xxs,940
 datahub/ingestion/source/datahub/state.py,sha256=PZoT7sSK1wadVf5vN6phrgr7I6LL7ePP-EJjP1OO0bQ,3507
 datahub/ingestion/source/dbt/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -398,7 +398,7 @@ datahub/ingestion/source/s3/config.py,sha256=Zs1nrBZKLImteZreIcSMMRLj8vBGgxakNDs
 datahub/ingestion/source/s3/datalake_profiler_config.py,sha256=k7S9Xcmgr3-CvWrd5NEX-V8JSrcAwkm7vbHPTVZicow,3620
 datahub/ingestion/source/s3/profiling.py,sha256=yKNCKpr6w7qpCH-baeSkNE9VjkN6eBot_weD-2_Jxzk,17579
 datahub/ingestion/source/s3/report.py,sha256=fzkTdTewYlWrTk4f2Cyl-e8RV4qw9wEVtm0cdKD-Xgw,542
-datahub/ingestion/source/s3/source.py,sha256=
+datahub/ingestion/source/s3/source.py,sha256=8O_vu1J91h7owQlYyK27AZAQHxKsDpNC_jsLNpMed98,47336
 datahub/ingestion/source/sac/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/sac/sac.py,sha256=zPSO9ukuyhvNaaVzeAYpA-_sFma_XMcCQMPaGvDWuTk,30226
 datahub/ingestion/source/sac/sac_common.py,sha256=-xQTDBtgH56AnpRXWGDnlmQqUuLRx-7wF1U1kQFWtX8,998
@@ -427,7 +427,7 @@ datahub/ingestion/source/snowflake/snowflake_assertion.py,sha256=_l3k4aI9wvioE81
 datahub/ingestion/source/snowflake/snowflake_config.py,sha256=LZqnTELtzRNf0vsKG-xXggXyt13S9RYvHOZEZHRjgNk,18851
 datahub/ingestion/source/snowflake/snowflake_connection.py,sha256=yzv-01FdmfDSCJY5rqKNNodXxzg3SS5DF7oA4WXArOA,17793
 datahub/ingestion/source/snowflake/snowflake_data_reader.py,sha256=ffR5E2uhD71FUMXd3XOg2rHwrp1rbbGEFTAbqKcmI2s,2195
-datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=
+datahub/ingestion/source/snowflake/snowflake_lineage_v2.py,sha256=suMICPFPvoV6shkjD_14JunLc8jAZBINzlFk2mYldkU,23676
 datahub/ingestion/source/snowflake/snowflake_profiler.py,sha256=0DJiSwII6FY34urlBja2FW66NaVvhbBWmG0p7u8Xyrc,7548
 datahub/ingestion/source/snowflake/snowflake_queries.py,sha256=fu-8S9eADIXZcd_kHc6cBeMa-on9RF9qG3yqjJnS3DE,26085
 datahub/ingestion/source/snowflake/snowflake_query.py,sha256=yDu_1aTAG7eLEh1w1FGmn2-c6NJZURdslnI6fC_4B_0,38723
@@ -486,9 +486,11 @@ datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider
 datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py,sha256=xsH7Ao_05VTjqpkzLkhdf5B1ULMzFoD8vkJJIJU9w-U,4077
 datahub/ingestion/source/state_provider/state_provider_registry.py,sha256=SVq4mIyGNmLXE9OZx1taOiNPqDoQp03-Ot9rYnB5F3k,401
 datahub/ingestion/source/tableau/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datahub/ingestion/source/tableau/tableau.py,sha256=
-datahub/ingestion/source/tableau/tableau_common.py,sha256=
-datahub/ingestion/source/tableau/tableau_constant.py,sha256=
+datahub/ingestion/source/tableau/tableau.py,sha256=P_DUuUvXk5u2ihA0JghtRkYc_KI_yQR2ZiQVe9IUvsU,138197
+datahub/ingestion/source/tableau/tableau_common.py,sha256=9gQLq_3BlAsKll83uVlnWJRWaIDtFtREUyuimXF13Z0,26219
+datahub/ingestion/source/tableau/tableau_constant.py,sha256=jVQMgLXND5aPL6XLETKp81BehRkvyLTU_Vhhe_1NOkI,2576
+datahub/ingestion/source/tableau/tableau_server_wrapper.py,sha256=PEGfcoUcBdsnOa5EzCqy1IiuQ3OZ9fZVEMzDqhhHOto,922
+datahub/ingestion/source/tableau/tableau_validation.py,sha256=l0DuXUuxJwEXMzo61xLx-KLc5u6tiz2n0e9EepJdWEM,1808
 datahub/ingestion/source/unity/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datahub/ingestion/source/unity/analyze_profiler.py,sha256=2pqkFY30CfN4aHgFZZntjeG0hNhBytZJvXC13VfTc1I,4689
 datahub/ingestion/source/unity/config.py,sha256=m4-n7mYz4Ct4L1QdfJFklwHyj8boKCbV7Sb3Ou6AT3Q,14756
@@ -882,7 +884,7 @@ datahub/testing/__init__.py,sha256=TywIuzGQvzJsNhI_PGD1RFk11M3RtGl9jIMtAVVHIkg,2
 datahub/testing/check_imports.py,sha256=EKuJmgUA46uOrlaOy0fCvPB7j9POkpJ0ExhO_pT3YAk,1356
 datahub/testing/check_sql_parser_result.py,sha256=f7U7IUSbfV4VACdNI857wPZ9tAZ9j6mXiXmcJNT_RzM,2671
 datahub/testing/check_str_enum.py,sha256=yqk0XXHOGteN-IGqCp5JHy0Kca13BnI09ZqKc4Nwl3E,1187
-datahub/testing/compare_metadata_json.py,sha256=
+datahub/testing/compare_metadata_json.py,sha256=pVJB2qLoKzEJLBXqFT-qGrxpA1y76y-mIbvJf0NnAD0,5274
 datahub/testing/docker_utils.py,sha256=g169iy_jNR_mg0p8X31cChZqjOryutAIHUYLq3xqueY,2415
 datahub/testing/doctest.py,sha256=1_8WEhHZ2eRQtw8vsXKzr9L5zzvs0Tcr6q4mnkyyvtw,295
 datahub/testing/mcp_diff.py,sha256=_sBFhmclYXJGQ_JYDrvKWXNGXt9ACvqeQvFaZrRHa8Q,10729
@@ -900,7 +902,7 @@ datahub/utilities/dedup_list.py,sha256=dUSpe1AajfuwlHVJKNv-CzDXSCkaw0HgSMOsxqUkQ
 datahub/utilities/delayed_iter.py,sha256=XlsI0DCXkVVejFKOW_uMT0E8DTqqOHQN3Ooak4EcULE,645
 datahub/utilities/delta.py,sha256=hkpF8W7Lvg2gUJBQR3mmIzOxsRQ6i5cchRPFlAVoV10,1128
 datahub/utilities/docs_build.py,sha256=uFMK3z1d4BExpsrvguHunidbEDAzQ8hoOP7iQ0A_IVw,211
-datahub/utilities/file_backed_collections.py,sha256=
+datahub/utilities/file_backed_collections.py,sha256=I2GxSYtVzfo38pQpv2FyoBeWISiKD4zUi0t34jPCNrU,21957
 datahub/utilities/global_warning_util.py,sha256=adrEl3WhetQ-bymrPINjd976ZFndhbvk3QosUYGsos8,261
 datahub/utilities/hive_schema_to_avro.py,sha256=2-7NI9haCAYbyUmHTb-QPxjn4WbmnDDKAIGmHJ11-1E,11622
 datahub/utilities/is_pytest.py,sha256=2m9T4S9IIKhI5RfTqrB2ZmumzHocdxBHpM1HroWj2XQ,138
@@ -974,8 +976,8 @@ datahub_provider/operators/datahub_assertion_operator.py,sha256=uvTQ-jk2F0sbqqxp
 datahub_provider/operators/datahub_assertion_sensor.py,sha256=lCBj_3x1cf5GMNpHdfkpHuyHfVxsm6ff5x2Z5iizcAo,140
 datahub_provider/operators/datahub_operation_operator.py,sha256=aevDp2FzX7FxGlXrR0khoHNbxbhKR2qPEX5e8O2Jyzw,174
 datahub_provider/operators/datahub_operation_sensor.py,sha256=8fcdVBCEPgqy1etTXgLoiHoJrRt_nzFZQMdSzHqSG7M,168
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
-acryl_datahub-0.15.
+acryl_datahub-0.15.0rc23.dist-info/METADATA,sha256=cPlJko8JF1pZEIihXKsAct2ai4okUHAMu8e3sAha7mU,173559
+acryl_datahub-0.15.0rc23.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+acryl_datahub-0.15.0rc23.dist-info/entry_points.txt,sha256=Yj0PWB0LQOq4Rj2fyR6ETx4BUGw4TOcNL0ZNoAZ9kQg,9504
+acryl_datahub-0.15.0rc23.dist-info/top_level.txt,sha256=iLjSrLK5ox1YVYcglRUkcvfZPvKlobBWx7CTUXx8_GI,25
+acryl_datahub-0.15.0rc23.dist-info/RECORD,,
datahub/__init__.py
CHANGED
datahub/cli/ingest_cli.py
CHANGED
@@ -27,6 +27,7 @@ from datahub.utilities.perf_timer import PerfTimer

 logger = logging.getLogger(__name__)

+INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"]
 RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"]
 RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"]

@@ -437,6 +438,115 @@ def mcps(path: str) -> None:
     sys.exit(ret)


+@ingest.command()
+@click.argument("page_offset", type=int, default=0)
+@click.argument("page_size", type=int, default=100)
+@click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.")
+@click.option(
+    "--source", type=str, default=None, help="Filter by ingestion source name."
+)
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
+    """List ingestion source runs with their details, optionally filtered by URN or source."""
+
+    query = """
+        query listIngestionRuns($input: ListIngestionSourcesInput!) {
+          listIngestionSources(input: $input) {
+            ingestionSources {
+              urn
+              name
+              executions {
+                executionRequests {
+                  id
+                  result {
+                    startTimeMs
+                    status
+                  }
+                }
+              }
+            }
+          }
+        }
+    """
+
+    # filter by urn and/or source using CONTAINS
+    filters = []
+    if urn:
+        filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"})
+    if source:
+        filters.append({"field": "name", "values": [source], "condition": "CONTAIN"})
+
+    variables = {
+        "input": {
+            "start": page_offset,
+            "count": page_size,
+            "filters": filters,
+        }
+    }
+
+    client = get_default_graph()
+    session = client._session
+    gms_host = client.config.server
+
+    url = f"{gms_host}/api/graphql"
+    try:
+        response = session.post(url, json={"query": query, "variables": variables})
+        response.raise_for_status()
+    except Exception as e:
+        click.echo(f"Error fetching data: {str(e)}")
+        return
+
+    try:
+        data = response.json()
+    except ValueError:
+        click.echo("Failed to parse JSON response from server.")
+        return
+
+    if not data:
+        click.echo("No response received from the server.")
+        return
+
+    # when urn or source filter does not match, exit gracefully
+    if (
+        not isinstance(data.get("data"), dict)
+        or "listIngestionSources" not in data["data"]
+    ):
+        click.echo("No matching ingestion sources found. Please check your filters.")
+        return
+
+    ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
+    if not ingestion_sources:
+        click.echo("No ingestion sources or executions found.")
+        return
+
+    rows = []
+    for ingestion_source in ingestion_sources:
+        urn = ingestion_source.get("urn", "N/A")
+        name = ingestion_source.get("name", "N/A")
+
+        executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+        for execution in executions:
+            execution_id = execution.get("id", "N/A")
+            start_time = execution.get("result", {}).get("startTimeMs", "N/A")
+            start_time = (
+                datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
+                if start_time != "N/A"
+                else "N/A"
+            )
+            status = execution.get("result", {}).get("status", "N/A")
+
+            rows.append([execution_id, name, start_time, status, urn])
+
+    click.echo(
+        tabulate(
+            rows,
+            headers=INGEST_SRC_TABLE_COLUMNS,
+            tablefmt="grid",
+        )
+    )
+
+
 @ingest.command()
 @click.argument("page_offset", type=int, default=0)
 @click.argument("page_size", type=int, default=100)
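For reference, the new subcommand is a thin wrapper over DataHub's GraphQL API, and with click's underscore-to-dash naming it should be invokable as `datahub ingest list-source-runs --source snowflake`. Below is a minimal standalone sketch of the same request; the host and the example filter value are placeholders, not part of the diff, and it assumes an unauthenticated local GMS.

# Hedged sketch: the GraphQL call behind the new list-source-runs command.
import requests

GMS_HOST = "http://localhost:8080"  # assumption: local DataHub GMS, no auth
QUERY = """
query listIngestionRuns($input: ListIngestionSourcesInput!) {
  listIngestionSources(input: $input) {
    ingestionSources {
      urn
      name
      executions { executionRequests { id result { startTimeMs status } } }
    }
  }
}
"""

variables = {
    "input": {
        "start": 0,
        "count": 100,
        # the same CONTAIN filter the command builds for --source
        "filters": [{"field": "name", "values": ["snowflake"], "condition": "CONTAIN"}],
    }
}

resp = requests.post(f"{GMS_HOST}/api/graphql", json={"query": QUERY, "variables": variables})
resp.raise_for_status()
for src in resp.json()["data"]["listIngestionSources"]["ingestionSources"]:
    print(src["urn"], src["name"])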
datahub/configuration/git.py
CHANGED
@@ -24,7 +24,11 @@ class GitReference(ConfigModel):
         "main",
         description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
     )
-
+    url_subdir: Optional[str] = Field(
+        default=None,
+        description="Prefix to prepend when generating URLs for files - useful when files are in a subdirectory. "
+        "Only affects URL generation, not git operations.",
+    )
     url_template: Optional[str] = Field(
         None,
         description=f"Template for generating a URL to a file in the repo e.g. '{_GITHUB_URL_TEMPLATE}'. We can infer this for GitHub and GitLab repos, and it is otherwise required."
@@ -68,6 +72,8 @@ class GitReference(ConfigModel):

     def get_url_for_file_path(self, file_path: str) -> str:
         assert self.url_template
+        if self.url_subdir:
+            file_path = f"{self.url_subdir}/{file_path}"
         return self.url_template.format(
             repo_url=self.repo, branch=self.branch, file_path=file_path
         )
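A short sketch of what the new `url_subdir` option changes, assuming the GitHub URL template is inferred from the repo as the field description suggests; the repo and file paths below are hypothetical.

from datahub.configuration.git import GitReference

ref = GitReference(
    repo="https://github.com/acme/analytics",  # hypothetical repo
    branch="main",
    url_subdir="dbt",  # files live under dbt/ in the repo
)

# url_subdir is prepended only when building the URL, so this should print
# https://github.com/acme/analytics/blob/main/dbt/models/orders.sql
print(ref.get_url_for_file_path("models/orders.sql"))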
datahub/ingestion/source/datahub/datahub_database_reader.py
CHANGED

@@ -147,6 +147,47 @@ class DataHubDatabaseReader:
         version
         """

+    def execute_server_cursor(
+        self, query: str, params: Dict[str, Any]
+    ) -> Iterable[Dict[str, Any]]:
+        with self.engine.connect() as conn:
+            if self.engine.dialect.name == "postgresql":
+                with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    conn = conn.execution_options(
+                        stream_results=True,
+                        yield_per=self.config.database_query_batch_size,
+                    )
+                    result = conn.execute(query, params)
+                    for row in result:
+                        yield dict(row)
+            elif self.engine.dialect.name == "mysql":  # MySQL
+                import MySQLdb
+
+                with contextlib.closing(
+                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
+                ) as cursor:
+                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
+                    cursor.execute(query, params)
+
+                    columns = [desc[0] for desc in cursor.description]
+                    while True:
+                        rows = cursor.fetchmany(self.config.database_query_batch_size)
+                        if not rows:
+                            break  # Use break instead of return in generator
+                        for row in rows:
+                            yield dict(zip(columns, row))
+            else:
+                raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
+
+    def _get_rows(
+        self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Dict[str, Any]]:
+        params = {
+            "exclude_aspects": list(self.config.exclude_aspects),
+            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
+        }
+        yield from self.execute_server_cursor(self.query, params)
+
     def get_aspects(
         self, from_createdon: datetime, stop_time: datetime
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
@@ -159,27 +200,6 @@ class DataHubDatabaseReader:
         if mcp:
             yield mcp, row["createdon"]

-    def _get_rows(
-        self, from_createdon: datetime, stop_time: datetime
-    ) -> Iterable[Dict[str, Any]]:
-        with self.engine.connect() as conn:
-            with contextlib.closing(conn.connection.cursor()) as cursor:
-                cursor.execute(
-                    self.query,
-                    {
-                        "exclude_aspects": list(self.config.exclude_aspects),
-                        "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-                    },
-                )
-
-                columns = [desc[0] for desc in cursor.description]
-                while True:
-                    rows = cursor.fetchmany(self.config.database_query_batch_size)
-                    if not rows:
-                        return
-                    for row in rows:
-                        yield dict(zip(columns, row))
-
     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
         Fetches all soft-deleted entities from the database.
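The refactor replaces a plain fetchmany loop with dialect-aware server-side cursors, so memory stays bounded while the whole aspect table streams through. A standalone sketch of the PostgreSQL branch, assuming SQLAlchemy 1.4.40+ (for the `yield_per` execution option) and an example connection URL:

from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/datahub")  # example URL

def stream_rows(query: str, batch_size: int = 1000):
    with engine.connect() as conn:
        with conn.begin():  # PostgreSQL server-side cursors require a transaction
            result = conn.execution_options(
                stream_results=True, yield_per=batch_size
            ).execute(text(query))
            for row in result:
                yield dict(row._mapping)

for row in stream_rows("SELECT urn, version FROM metadata_aspect_v2"):
    print(row["urn"])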
datahub/ingestion/source/datahub/datahub_source.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from datetime import datetime, timezone
+from datetime import datetime, timedelta, timezone
 from functools import partial
 from typing import Dict, Iterable, List, Optional

@@ -26,6 +26,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.schema_classes import ChangeTypeClass
+from datahub.utilities.progress_timer import ProgressTimer

 logger = logging.getLogger(__name__)

@@ -105,11 +106,17 @@ class DataHubSource(StatefulIngestionSourceBase):
         self, from_createdon: datetime, reader: DataHubDatabaseReader
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
+        progress = ProgressTimer(report_every=timedelta(seconds=60))
         mcps = reader.get_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
             if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                 continue

+            if progress.should_report():
+                logger.info(
+                    f"Ingested {i} database aspects so far, currently at {createdon}"
+                )
+
             yield mcp.as_workunit()
             self.report.num_database_aspects_ingested += 1
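ProgressTimer throttles the new progress log to roughly one line per minute regardless of row throughput. A hedged stand-in for the contract used above (an illustration, not the actual datahub.utilities.progress_timer source):

from datetime import datetime, timedelta

class ProgressTimer:
    """Illustrative stand-in: should_report() returns True at most once per interval."""

    def __init__(self, report_every: timedelta):
        self._report_every = report_every
        self._last_report = datetime.min  # forces a report on the first check

    def should_report(self) -> bool:
        now = datetime.now()
        if now - self._last_report >= self._report_every:
            self._last_report = now
            return True
        return False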
datahub/ingestion/source/s3/source.py
CHANGED

@@ -9,6 +9,7 @@ from datetime import datetime
 from itertools import groupby
 from pathlib import PurePath
 from typing import Any, Dict, Iterable, List, Optional, Tuple
+from urllib.parse import urlparse

 import smart_open.compression as so_compression
 from more_itertools import peekable
@@ -993,9 +994,7 @@ class S3Source(StatefulIngestionSourceBase):
         folders = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
-            prefix_to_process = dir.
-                self.create_s3_path(bucket_name, "/")
-            )
+            prefix_to_process = urlparse(dir).path.lstrip("/")

             folders.extend(
                 self.get_folder_info(
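The one-line replacement works because urlparse splits an s3:// URI into bucket (netloc) and key (path), so the prefix no longer has to be recovered by stripping a reconstructed bucket path:

from urllib.parse import urlparse

uri = "s3://my-bucket/raw/events/2024/"  # example URI
parsed = urlparse(uri)
print(parsed.netloc)            # my-bucket
print(parsed.path.lstrip("/"))  # raw/events/2024/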
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
CHANGED

@@ -414,9 +414,13 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         except Exception as e:
             self.report.num_upstream_lineage_edge_parsing_failed += 1
             upstream_tables = db_row.get("UPSTREAM_TABLES")
+            downstream_table = db_row.get("DOWNSTREAM_TABLE_NAME")
             self.structured_reporter.warning(
                 "Failed to parse lineage edge",
-
+                # Tricky: sometimes the full row data is too large, and so the context
+                # message gets truncated. By pulling out the upstreams and downstream
+                # list, we can at least get the important fields if truncation does occur.
+                context=f"Upstreams: {upstream_tables} Downstream: {downstream_table} Full row: {db_row}",
                 exc=e,
             )
             return None
datahub/ingestion/source/tableau/tableau.py
CHANGED

@@ -111,6 +111,8 @@ from datahub.ingestion.source.tableau.tableau_common import (
     tableau_field_to_schema_field,
     workbook_graphql_query,
 )
+from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
+from datahub.ingestion.source.tableau.tableau_validation import check_user_role
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -167,7 +169,7 @@ from datahub.utilities.urns.dataset_urn import DatasetUrn

 try:
     # On earlier versions of the tableauserverclient, the NonXMLResponseError
-    # was thrown when reauthentication was
+    # was thrown when reauthentication was necessary. We'll keep both exceptions
     # around for now, but can remove this in the future.
     from tableauserverclient.server.endpoint.exceptions import (  # type: ignore
         NotSignedInError,
@@ -632,6 +634,33 @@ class TableauSourceReport(StaleEntityRemovalSourceReport):
     num_upstream_table_lineage_failed_parse_sql: int = 0
     num_upstream_fine_grained_lineage_failed_parse_sql: int = 0
     num_hidden_assets_skipped: int = 0
+    logged_in_user: List[UserInfo] = []
+
+
+def report_user_role(report: TableauSourceReport, server: Server) -> None:
+    title: str = "Insufficient Permissions"
+    message: str = "The user must have the `Site Administrator Explorer` role to perform metadata ingestion."
+    try:
+        # TableauSiteSource instance is per site, so each time we need to find-out user detail
+        # the site-role might be different on another site
+        logged_in_user: UserInfo = UserInfo.from_server(server=server)
+
+        if not logged_in_user.is_site_administrator_explorer():
+            report.warning(
+                title=title,
+                message=message,
+                context=f"user-name={logged_in_user.user_name}, role={logged_in_user.site_role}, site_id={logged_in_user.site_id}",
+            )
+
+        report.logged_in_user.append(logged_in_user)
+
+    except Exception as e:
+        report.warning(
+            title=title,
+            message="Failed to verify the user's role. The user must have `Site Administrator Explorer` role.",
+            context=f"{e}",
+            exc=e,
+        )


 @platform_name("Tableau")
@@ -676,6 +705,7 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         try:
             logger.info(f"Authenticated to Tableau site: '{site_content_url}'")
             self.server = self.config.make_tableau_client(site_content_url)
+            report_user_role(report=self.report, server=self.server)
             # Note that we're not catching ConfigurationError, since we want that to throw.
         except ValueError as e:
             self.report.failure(
@@ -689,9 +719,17 @@ class TableauSource(StatefulIngestionSourceBase, TestableSource):
         test_report = TestConnectionReport()
         try:
             source_config = TableauConfig.parse_obj_allow_extras(config_dict)
-
+
+            server = source_config.make_tableau_client(source_config.site)
+
             test_report.basic_connectivity = CapabilityReport(capable=True)
+
+            test_report.capability_report = check_user_role(
+                logged_in_user=UserInfo.from_server(server=server)
+            )
+
         except Exception as e:
+            logger.warning(f"{e}", exc_info=e)
             test_report.basic_connectivity = CapabilityReport(
                 capable=False, failure_reason=str(e)
             )
@@ -831,6 +869,8 @@ class TableauSiteSource:
         # when emitting custom SQL data sources.
         self.custom_sql_ids_being_used: List[str] = []

+        report_user_role(report=report, server=server)
+
     @property
     def no_env_browse_prefix(self) -> str:
         # Prefix to use with browse path (v1)
@@ -1290,7 +1330,6 @@ class TableauSiteSource:
         page_size = page_size_override or self.config.page_size

         filter_pages = get_filter_pages(query_filter, page_size)
-
         for filter_page in filter_pages:
             has_next_page = 1
             current_cursor: Optional[str] = None
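With this change, `test_connection` reports the site-role capability check alongside basic connectivity. A hedged sketch of invoking it directly; the server address, site, and token values are placeholders, and an unreachable server simply comes back as capable=False rather than raising:

from datahub.ingestion.source.tableau.tableau import TableauSource

report = TableauSource.test_connection(
    {
        "connect_uri": "https://tableau.example.com",
        "site": "my-site",
        "token_name": "token-name",
        "token_value": "token-value",
    }
)
print(report.basic_connectivity)
print(report.capability_report)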
datahub/ingestion/source/tableau/tableau_common.py
CHANGED

@@ -975,15 +975,22 @@ def get_filter_pages(query_filter: dict, page_size: int) -> List[dict]:
     # a few ten thousand, then tableau server responds with empty response
     # causing below error:
     # tableauserverclient.server.endpoint.exceptions.NonXMLResponseError: b''
+
+    # in practice, we only do pagination if len(query_filter.keys()) == 1
+    if len(query_filter.keys()) != 1:
+        return filter_pages
+
+    current_key = (list(query_filter.keys()))[0]
+
     if (
-
-        and query_filter.get(
-        and isinstance(query_filter[
+        current_key in [c.ID_WITH_IN, c.PROJECT_NAME_WITH_IN]
+        and query_filter.get(current_key)
+        and isinstance(query_filter[current_key], list)
     ):
-        ids = query_filter[
+        ids = query_filter[current_key]
         filter_pages = [
             {
-
+                current_key: ids[
                     start : (
                         start + page_size if start + page_size < len(ids) else len(ids)
                     )
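The rewritten `get_filter_pages` generalizes pagination from a hard-coded filter key to whichever single key is present. The core chunking step, as a standalone sketch (the `idWithin` key name is illustrative):

from typing import Dict, List

def chunk_filter(key: str, values: List[str], page_size: int) -> List[Dict[str, List[str]]]:
    # One filter dict per page_size-sized slice of the value list;
    # Python slicing clamps the final page automatically.
    return [
        {key: values[start : start + page_size]}
        for start in range(0, len(values), page_size)
    ]

# chunk_filter("idWithin", ["a", "b", "c"], 2)
# -> [{"idWithin": ["a", "b"]}, {"idWithin": ["c"]}]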
datahub/ingestion/source/tableau/tableau_server_wrapper.py
ADDED

@@ -0,0 +1,33 @@
+from dataclasses import dataclass
+
+from tableauserverclient import Server, UserItem
+
+from datahub.ingestion.source.tableau import tableau_constant as c
+
+
+@dataclass
+class UserInfo:
+    user_name: str
+    site_role: str
+    site_id: str
+
+    def is_site_administrator_explorer(self):
+        return self.site_role == c.SITE_ROLE
+
+    @staticmethod
+    def from_server(server: Server) -> "UserInfo":
+        assert server.user_id, "make the connection with tableau"
+
+        user: UserItem = server.users.get_by_id(server.user_id)
+
+        assert user.site_role, "site_role is not available"  # to silent the lint
+
+        assert user.name, "user name is not available"  # to silent the lint
+
+        assert server.site_id, "site identifier is not available"  # to silent the lint
+
+        return UserInfo(
+            user_name=user.name,
+            site_role=user.site_role,
+            site_id=server.site_id,
+        )
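A hedged usage sketch for the new wrapper: given a signed-in tableauserverclient Server, fetch the logged-in user's site role. The server address, token, and site values are placeholders:

import tableauserverclient as TSC

from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo

auth = TSC.PersonalAccessTokenAuth("token-name", "token-value", site_id="my-site")
server = TSC.Server("https://tableau.example.com", use_server_version=True)

with server.auth.sign_in(auth):
    user = UserInfo.from_server(server=server)
    print(user.user_name, user.site_role, user.is_site_administrator_explorer())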
datahub/ingestion/source/tableau/tableau_validation.py
ADDED

@@ -0,0 +1,48 @@
+import logging
+from typing import Dict, Union
+
+from datahub.ingestion.api.source import CapabilityReport, SourceCapability
+from datahub.ingestion.source.tableau import tableau_constant as c
+from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
+
+logger = logging.getLogger(__name__)
+
+
+def check_user_role(
+    logged_in_user: UserInfo,
+) -> Dict[Union[SourceCapability, str], CapabilityReport]:
+    capability_dict: Dict[Union[SourceCapability, str], CapabilityReport] = {
+        c.SITE_PERMISSION: CapabilityReport(
+            capable=True,
+        )
+    }
+
+    failure_reason: str = (
+        "The user does not have the `Site Administrator Explorer` role."
+    )
+
+    mitigation_message_prefix: str = (
+        "Assign `Site Administrator Explorer` role to the user"
+    )
+    mitigation_message_suffix: str = "Refer to the setup guide: https://datahubproject.io/docs/quick-ingestion-guides/tableau/setup"
+
+    try:
+        # TODO: Add check for `Enable Derived Permissions`
+        if not logged_in_user.is_site_administrator_explorer():
+            capability_dict[c.SITE_PERMISSION] = CapabilityReport(
+                capable=False,
+                failure_reason=f"{failure_reason} Their current role is {logged_in_user.site_role}.",
+                mitigation_message=f"{mitigation_message_prefix} `{logged_in_user.user_name}`. {mitigation_message_suffix}",
+            )
+
+        return capability_dict
+
+    except Exception as e:
+        logger.warning(msg=e, exc_info=e)
+        capability_dict[c.SITE_PERMISSION] = CapabilityReport(
+            capable=False,
+            failure_reason="Failed to verify user role.",
+            mitigation_message=f"{mitigation_message_prefix}. {mitigation_message_suffix}",  # user is unknown
+        )
+
+        return capability_dict
datahub/testing/compare_metadata_json.py
CHANGED

@@ -117,7 +117,7 @@ def diff_metadata_json(
     ignore_paths: Sequence[str] = (),
     ignore_order: bool = True,
 ) -> Union[DeepDiff, MCPDiff]:
-    ignore_paths =
+    ignore_paths = [*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info"]
     try:
         if ignore_order:
             golden_map = get_aspects_by_urn(golden)