PyPI - castor-extractor - Versions diffs - 0.19.8__py3-none-any.whl → 0.20.4__py3-none-any.whl - Mend

castor-extractor 0.19.8__py3-none-any.whl → 0.20.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of castor-extractor might be problematic. Click here for more details.

Files changed (46)

CHANGELOG.md CHANGED Viewed

@@ -1,6 +1,30 @@
 # Changelog
+## 0.20.4 - 2024-10-09
+* Salesforce warehouse: `Labels` instead of `api_names` for columns
+## 0.20.3 - 2024-10-03
+* Looker: no longer extract `as_html` dashboard elements
+## 0.20.2 - 2024-09-24
+* Thoughtspot: Adding connector
+## 0.20.1 - 2024-09-23
+* Power BI: Improved client based on APIClient
+## 0.20.0 - 2024-09-23
+* Switch to Tableau revamped connector
+## 0.19.9 - 2024-09-19
+* Databricks: multithreading to retrieve column lineage
 ## 0.19.8 - 2024-09-18
 * Metabase: Handle duplicate dashboards

castor_extractor/commands/extract_tableau.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import logging
 from argparse import ArgumentParser
-from castor_extractor.visualization import tableau  # type: ignore
+from castor_extractor.utils import parse_filled_arguments  # type: ignore
+from castor_extractor.visualization import tableau_revamp  # type: ignore
 logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
@@ -19,29 +20,14 @@ def main():
     parser.add_argument("-b", "--server-url", help="Tableau server url")
     parser.add_argument("-i", "--site-id", help="Tableau site ID")
     parser.add_argument(
-        "-s",
-        "--safe-mode",
-        help="Tableau safe mode",
+        "--with-pulse",
+        dest="with_pulse",
         action="store_true",
+        help="Extract Tableau Pulse assets: Metrics and Subscriptions",
     )
-    parser.add_argument("-o", "--output", help="Directory to write to")
-    args = parser.parse_args()
-    client = tableau.ApiClient(
-        user=args.user,
-        password=args.password,
-        token_name=args.token_name,
-        token=args.token,
-        server_url=args.server_url,
-        site_id=args.site_id,
-        safe_mode=args.safe_mode,
-    )
-    client.login()
+    parser.add_argument("-o", "--output", help="Directory to write to")
-    tableau.extract_all(
-        client,
-        output_directory=args.output,
-    )
+    tableau_revamp.extract_all(**parse_filled_arguments(parser))

castor_extractor/commands/extract_thoughtspot.py ADDED Viewed

@@ -0,0 +1,18 @@
+import logging
+from argparse import ArgumentParser
+from castor_extractor.utils import parse_filled_arguments  # type: ignore
+from castor_extractor.visualization import thoughtspot  # type: ignore
+logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
+def main():
+    parser = ArgumentParser()
+    parser.add_argument("-b", "--base_url", help="base url")
+    parser.add_argument("-u", "--username", help="username")
+    parser.add_argument("-p", "--password", help="password")
+    parser.add_argument("-o", "--output", help="Directory to write to")
+    thoughtspot.extract_all(**parse_filled_arguments(parser))

castor_extractor/utils/__init__.py CHANGED Viewed

@@ -34,7 +34,7 @@ from .pager import (
     PagerOnIdLogger,
     PagerStopStrategy,
 )
-from .retry import RetryStrategy, retry
+from .retry import RetryStrategy, retry, retry_request
 from .safe import SafeMode, safe_mode
 from .store import AbstractStorage, LocalStorage
 from .string import decode_when_bytes, string_to_tuple

castor_extractor/utils/client/api/client.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 from http import HTTPStatus
-from typing import Dict, Literal, Optional, Tuple
+from typing import Callable, Dict, Literal, Optional, Tuple
 import requests
 from requests import Response
@@ -26,18 +26,18 @@ def _generate_payloads(
     params: Optional[dict],
     data: Optional[dict],
     pagination_params: Optional[dict],
-) -> Tuple[dict, dict]:
+) -> Tuple[Optional[dict], Optional[dict]]:
     _pagination_params = pagination_params or {}
-    params = params or {}
-    data = data or {}
     if method == "GET":
+        params = params or {}
         params = {**params, **_pagination_params}
-    elif method == "POST":
+        return data, params
+    if method == "POST":
+        data = data or {}
         data = {**data, **_pagination_params}
-    else:
-        raise ValueError(f"Method {method} is not yet supported")
-    return data, params
+        return data, params
+    raise ValueError(f"Method {method} is not yet supported")
 class APIClient:
@@ -137,12 +137,15 @@ class APIClient:
         endpoint: str,
         *,
         headers: Headers = None,
+        params: Optional[dict] = None,
         data: Optional[dict] = None,
         pagination_params: Optional[dict] = None,
+        handler: Optional[Callable] = None,
     ):
         response = self._call(
             method="POST",
             endpoint=endpoint,
+            params=params,
             data=data,
             pagination_params=pagination_params,
             headers=headers,
@@ -150,4 +153,6 @@ class APIClient:
         if response.status_code == HTTPStatus.UNAUTHORIZED:
             self._auth.refresh_token()
-        return handle_response(response, safe_mode=self._safe_mode)
+        return handle_response(
+            response, safe_mode=self._safe_mode, handler=handler
+        )

castor_extractor/utils/client/api/pagination.py CHANGED Viewed

@@ -67,7 +67,7 @@ def fetch_all_pages(
     response_payload = request()
     paginated_response = pagination_model(**response_payload)
     while not paginated_response.is_last():
-        logger.info(f"Fetching page number {page_number}")
+        logger.debug(f"Fetching page number {page_number}")
         yield from paginated_response.page_results()
         next_page_parameters = paginated_response.next_page_parameters()
         new_request = partial(request, **next_page_parameters)
@@ -79,5 +79,5 @@ def fetch_all_pages(
         page_number += 1
     # send last page's results
-    logger.info(f"Fetching page number {page_number}")
+    logger.debug(f"Fetching page number {page_number}")
     yield from paginated_response.page_results()

castor_extractor/utils/client/api/safe_request.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 from requests import HTTPError, Response
@@ -41,9 +41,10 @@ class RequestSafeMode:
 def handle_response(
     response: Response,
     safe_mode: Optional[RequestSafeMode] = None,
+    handler: Optional[Callable] = None,
 ) -> Any:
     """
-    Util to handle a HTTP Response based on the response status code and the
+    Util to handle HTTP Response based on the response status code and the
     safe mode used
     """
     safe_mode = safe_mode if safe_mode else RequestSafeMode()
@@ -56,4 +57,6 @@ def handle_response(
         logger.error(f"Safe mode : skip request with error {e}")
         logger.debug(e, exc_info=True)
         return {}
-    return response.json()
+    if not handler:
+        return response.json()
+    return handler(response)

castor_extractor/utils/write.py CHANGED Viewed

@@ -35,7 +35,7 @@ def write_json(filename: str, data: Any):
     """
     with open(filename, "w", encoding=ENCODING) as f:
         json.dump(data, f)
-        logger.info(f"Wrote output file: {filename}")
+        logger.info(f"Wrote output file: {filename} ({f.tell()} bytes)")
 def _current_version() -> str:

castor_extractor/visualization/looker/api/constants.py CHANGED Viewed

@@ -32,14 +32,10 @@ DASHBOARD_FILTERS = (
 DASHBOARD_ELEMENTS = (
     "id",
     "body_text",
-    "body_text_as_html",
     "note_text",
-    "note_text_as_html",
     "subtitle_text",
-    "subtitle_text_as_html",
     "title",
     "title_text",
-    "title_text_as_html",
     "title_hidden",
     "type",
     {

castor_extractor/visualization/powerbi/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 from .assets import PowerBiAsset
-from .client import Client, PowerbiCredentials, Urls
+from .client import DEFAULT_SCOPE, PowerbiClient, PowerbiCredentials
 from .extract import extract_all

castor_extractor/visualization/powerbi/assets.py CHANGED Viewed

@@ -11,16 +11,19 @@ class PowerBiAsset(ExternalAsset):
     DATASETS = "datasets"
     DATASET_FIELDS = "dataset_fields"
     METADATA = "metadata"
+    PAGES = "pages"
     REPORTS = "reports"
     TABLES = "tables"
+    TILES = "tiles"
     USERS = "users"
     @classproperty
     def optional(cls) -> Set["PowerBiAsset"]:
         return {
-            PowerBiAsset.DASHBOARDS,
             PowerBiAsset.DATASET_FIELDS,
+            PowerBiAsset.PAGES,
             PowerBiAsset.TABLES,
+            PowerBiAsset.TILES,
             PowerBiAsset.USERS,
         }
@@ -30,5 +33,8 @@ class PowerBiAsset(ExternalAsset):
 METADATA_ASSETS = (
     PowerBiAsset.DATASET_FIELDS,
     PowerBiAsset.TABLES,
+    PowerBiAsset.TILES,
     PowerBiAsset.USERS,
 )
+REPORTS_ASSETS = (PowerBiAsset.PAGES,)

castor_extractor/visualization/powerbi/client/__init__.py CHANGED Viewed

@@ -1,3 +1,2 @@
-from .constants import Urls
-from .credentials import PowerbiCredentials
-from .rest import Client
+from .client import PowerbiClient
+from .credentials import DEFAULT_SCOPE, PowerbiCredentials

castor_extractor/visualization/powerbi/client/authentication.py ADDED Viewed

@@ -0,0 +1,27 @@
+import msal  # type: ignore
+from ....utils import BearerAuth
+from .constants import Keys
+from .credentials import PowerbiCredentials
+from .endpoints import PowerBiEndpointFactory
+class PowerBiBearerAuth(BearerAuth):
+    def __init__(self, credentials: PowerbiCredentials):
+        self.credentials = credentials
+        authority = PowerBiEndpointFactory.authority(self.credentials.tenant_id)
+        self.app = msal.ConfidentialClientApplication(
+            client_id=self.credentials.client_id,
+            authority=authority,
+            client_credential=self.credentials.secret,
+        )
+    def fetch_token(self):
+        token = self.app.acquire_token_for_client(
+            scopes=self.credentials.scopes
+        )
+        if Keys.ACCESS_TOKEN not in token:
+            raise ValueError(f"No access token in token response: {token}")
+        return token[Keys.ACCESS_TOKEN]

castor_extractor/visualization/powerbi/client/client.py ADDED Viewed

@@ -0,0 +1,207 @@
+import logging
+from datetime import date
+from functools import partial
+from time import sleep
+from typing import Dict, Iterator, List, Optional, Union
+import requests
+from requests import HTTPError
+from ....utils import (
+    APIClient,
+    fetch_all_pages,
+)
+from ..assets import PowerBiAsset
+from .authentication import PowerBiBearerAuth
+from .constants import Keys
+from .credentials import PowerbiCredentials
+from .endpoints import PowerBiEndpointFactory
+from .pagination import PowerBiPagination
+POWERBI_DEFAULT_TIMEOUT_S = 30
+# The route we use to fetch workspaces info can retrieve a maximum of
+# 100 workspaces per call
+# More: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info#request-body
+METADATA_BATCH_SIZE = 100
+POWERBI_SCAN_STATUS_DONE = "Succeeded"
+POWERBI_SCAN_SLEEP_S = 1
+logger = logging.getLogger(__name__)
+class PowerbiClient(APIClient):
+    def __init__(
+        self,
+        credentials: PowerbiCredentials,
+    ):
+        auth = PowerBiBearerAuth(credentials=credentials)
+        super().__init__(
+            auth=auth,
+            timeout=POWERBI_DEFAULT_TIMEOUT_S,
+        )
+    def _activity_events(self, day: Optional[date] = None) -> Iterator[Dict]:
+        """
+        Returns a list of activity events for the organization.
+        https://learn.microsoft.com/en-us/power-bi/admin/service-admin-auditing#activityevents-rest-api
+        - when no day is specified, fallback is yesterday
+        """
+        request = partial(
+            self._get,
+            endpoint=PowerBiEndpointFactory.activity_events(day),
+        )
+        yield from fetch_all_pages(request, PowerBiPagination)
+    def _datasets(self) -> Iterator[Dict]:
+        """
+        Returns a list of datasets for the organization.
+        https://learn.microsoft.com/en-us/rest/api/power-bi/admin/datasets-get-datasets-as-admin
+        """
+        yield from self._get(PowerBiEndpointFactory.datasets())[Keys.VALUE]
+    def _dashboards(self) -> Iterator[Dict]:
+        """
+        Returns a list of dashboards for the organization.
+        https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dashboards-get-dashboards-as-admin
+        """
+        yield from self._get(PowerBiEndpointFactory.dashboards())[Keys.VALUE]
+    def _reports(self) -> Iterator[Dict]:
+        """
+        Returns a list of reports for the organization.
+        https://learn.microsoft.com/en-us/rest/api/power-bi/admin/reports-get-reports-as-admin
+        """
+        reports_endpoint = PowerBiEndpointFactory.reports()
+        reports = self._get(reports_endpoint)[Keys.VALUE]
+        for report in reports:
+            report_id = report.get(Keys.ID)
+            try:
+                pages_endpoint = PowerBiEndpointFactory.pages(report_id)
+                pages = self._get(pages_endpoint)[Keys.VALUE]
+                report["pages"] = pages
+            except (requests.HTTPError, requests.exceptions.Timeout) as e:
+                logger.debug(e)
+                continue
+        return reports
+    def _workspace_ids(self) -> List[str]:
+        """
+        Get workspaces ids from powerBI admin API.
+        more: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-get-modified-workspaces
+        """
+        params: Dict[str, Union[bool, str]] = {
+            Keys.INACTIVE_WORKSPACES: True,
+            Keys.PERSONAL_WORKSPACES: True,
+        }
+        response = self._get(
+            PowerBiEndpointFactory.workspace_ids(),
+            params=params,
+        )
+        return [x[Keys.ID] for x in response]
+    def _get_scan_result(self, scan_id: int) -> Iterator[Dict]:
+        endpoint = PowerBiEndpointFactory.metadata_scan_result(scan_id)
+        yield from self._get(endpoint)[Keys.WORKSPACES]
+    def _wait_for_scan_result(self, scan_id: int) -> bool:
+        """
+        Periodically checks the status of the metadata scan until the results
+        are ready.
+        """
+        endpoint = PowerBiEndpointFactory.metadata_scan_status(scan_id)
+        total_waiting_time_s = 0
+        while total_waiting_time_s < POWERBI_DEFAULT_TIMEOUT_S:
+            try:
+                result = self._get(endpoint)
+            except HTTPError as e:
+                logger.error(f"Scan {scan_id} failed. Error: {e}")
+                return False
+            if result[Keys.STATUS] == POWERBI_SCAN_STATUS_DONE:
+                logger.info(f"scan {scan_id} ready")
+                return True
+            total_waiting_time_s += POWERBI_SCAN_SLEEP_S
+            logger.info(
+                f"Waiting {POWERBI_SCAN_SLEEP_S} sec for scan {scan_id} to be ready…",
+            )
+            sleep(POWERBI_SCAN_SLEEP_S)
+        logger.warning(f"Scan {scan_id} timed out")
+        return False
+    def _create_scan(self, workspaces_ids: List[str]) -> int:
+        """
+        Tells the Power BI API to start an asynchronous metadata scan.
+        Returns the scan's ID.
+        https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info
+        """
+        params = {
+            "datasetExpressions": True,
+            "datasetSchema": True,
+            "datasourceDetails": True,
+            "getArtifactUsers": True,
+            "lineage": True,
+        }
+        request_body = {"workspaces": workspaces_ids}
+        scan_id = self._post(
+            PowerBiEndpointFactory.metadata_create_scan(),
+            params=params,
+            data=request_body,
+        )
+        return scan_id[Keys.ID]
+    def _metadata(self) -> Iterator[Dict]:
+        """
+        Fetch metadata by workspace. The metadata scanning is asynchronous and
+        requires the following steps:
+        - create the asynchronous scan
+        - periodically check the scan status to know when it's finished
+        - get the actual scan results
+        https://learn.microsoft.com/en-us/power-bi/enterprise/service-admin-metadata-scanning
+        """
+        ids = self._workspace_ids()
+        for index in range(0, len(ids), METADATA_BATCH_SIZE):
+            batch_ids = ids[index : index + METADATA_BATCH_SIZE]
+            scan_id = self._create_scan(batch_ids)
+            self._wait_for_scan_result(scan_id)
+            yield from self._get_scan_result(scan_id)
+    def test_connection(self) -> None:
+        """Use credentials & verify requesting the API doesn't raise an error"""
+        self._auth.refresh_token()
+    def fetch(
+        self,
+        asset: PowerBiAsset,
+        *,
+        day: Optional[date] = None,
+    ) -> Iterator[Dict]:
+        """
+        Given a PowerBi asset, returns the corresponding data using the
+        appropriate client.
+        """
+        if asset == PowerBiAsset.ACTIVITY_EVENTS:
+            yield from self._activity_events(day=day)
+        elif asset == PowerBiAsset.DATASETS:
+            yield from self._datasets()
+        elif asset == PowerBiAsset.DASHBOARDS:
+            yield from self._dashboards()
+        elif asset == PowerBiAsset.REPORTS:
+            yield from self._reports()
+        elif asset == PowerBiAsset.METADATA:
+            yield from self._metadata()
+        else:
+            raise ValueError(f"This asset {asset} is unknown")

castor_extractor/visualization/powerbi/client/client_test.py ADDED Viewed

@@ -0,0 +1,173 @@
+from datetime import date
+from unittest.mock import Mock, call, patch
+import pytest
+from .authentication import msal
+from .client import PowerbiClient
+from .constants import Keys
+from .credentials import PowerbiCredentials
+from .endpoints import PowerBiEndpointFactory
+FAKE_TENANT_ID = "IamFake"
+FAKE_CLIENT_ID = "MeTwo"
+FAKE_SECRET = "MeThree"
+@pytest.fixture
+def mock_msal():
+    with patch.object(msal, "ConfidentialClientApplication") as mock_app:
+        mock_app.return_value.acquire_token_for_client.return_value = {
+            "access_token": "fake_token"
+        }
+        yield mock_app
+@pytest.fixture
+def power_bi_client(mock_msal):
+    creds = PowerbiCredentials(
+        tenant_id=FAKE_TENANT_ID,
+        client_id=FAKE_CLIENT_ID,
+        secret=FAKE_SECRET,
+    )
+    return PowerbiClient(creds)
+def test__access_token(power_bi_client, mock_msal):
+    # Valid token scenario
+    valid_token = "mock_token"
+    mock_response = {"access_token": valid_token}
+    returning_valid_token = Mock(return_value=mock_response)
+    mock_msal.return_value.acquire_token_for_client = returning_valid_token
+    assert power_bi_client._auth.fetch_token() == valid_token
+    # Invalid token scenario
+    invalid_response = {"not_access_token": "666"}
+    returning_invalid_token = Mock(return_value=invalid_response)
+    mock_msal.return_value.acquire_token_for_client = returning_invalid_token
+    with pytest.raises(ValueError):
+        power_bi_client._auth.fetch_token()
+def test__datasets(power_bi_client):
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.return_value = {"value": [{"id": 1, "type": "dataset"}]}
+        datasets = list(power_bi_client._datasets())
+        mocked_get.assert_called_with(PowerBiEndpointFactory.datasets())
+        assert datasets == [{"id": 1, "type": "dataset"}]
+def test__dashboards(power_bi_client):
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.return_value = {"value": [{"id": 1, "type": "dashboard"}]}
+        dashboards = list(power_bi_client._dashboards())
+        mocked_get.assert_called_with(PowerBiEndpointFactory.dashboards())
+        assert dashboards == [{"id": 1, "type": "dashboard"}]
+def test__reports(power_bi_client):
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.side_effect = [
+            {"value": [{"id": 1, "type": "report"}]},
+            {
+                "value": [
+                    {"name": "page_name", "displayName": "page", "order": 0}
+                ]
+            },
+        ]
+        reports = list(power_bi_client._reports())
+        calls = [
+            call(PowerBiEndpointFactory.reports()),
+            call(PowerBiEndpointFactory.pages("1")),
+        ]
+        mocked_get.assert_has_calls(calls)
+        assert reports == [
+            {
+                "id": 1,
+                "type": "report",
+                "pages": [
+                    {"name": "page_name", "displayName": "page", "order": 0}
+                ],
+            }
+        ]
+def test__workspace_ids(power_bi_client):
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.return_value = [{"id": 1000}, {"id": 1001}, {"id": 1003}]
+        ids = power_bi_client._workspace_ids()
+        assert ids == [1000, 1001, 1003]
+        params = {
+            Keys.INACTIVE_WORKSPACES: True,
+            Keys.PERSONAL_WORKSPACES: True,
+        }
+        mocked_get.assert_called_with(
+            PowerBiEndpointFactory.workspace_ids(),
+            params=params,
+        )
+@patch.object(PowerbiClient, "_get_scan_result")
+@patch.object(PowerbiClient, "_wait_for_scan_result")
+@patch.object(PowerbiClient, "_create_scan")
+@patch.object(PowerbiClient, "_workspace_ids")
+def test__metadata(
+    mock_workspace_ids,
+    mock_create_scan,
+    mock_wait_for_scan,
+    mock_get_scan_result,
+    power_bi_client,
+):
+    mock_workspace_ids.return_value = list(range(200))
+    mock_create_scan.return_value = 314
+    mock_wait_for_scan.return_value = True
+    mock_get_scan_result.return_value = [{"workspace_id": 1871}]
+    result = list(power_bi_client._metadata())
+    assert result == [{"workspace_id": 1871}, {"workspace_id": 1871}]
+def test__activity_events(power_bi_client):
+    day = date.today()
+    mocked_get_results = [
+        {
+            Keys.ACTIVITY_EVENT_ENTITIES: ["foo", "bar"],
+            Keys.LAST_RESULT_SET: False,
+            Keys.CONTINUATION_URI: "https://next-call-1",
+        },
+        {
+            Keys.ACTIVITY_EVENT_ENTITIES: ["baz"],
+            Keys.LAST_RESULT_SET: False,
+            Keys.CONTINUATION_URI: "https://next-call-2",
+        },
+        {
+            Keys.ACTIVITY_EVENT_ENTITIES: ["biz"],
+            Keys.LAST_RESULT_SET: True,
+            Keys.CONTINUATION_URI: None,
+        },
+    ]
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.side_effect = mocked_get_results
+        result = list(power_bi_client._activity_events(day=day))
+        assert result == ["foo", "bar", "baz", "biz"]
+        expected_calls = [
+            call(endpoint=PowerBiEndpointFactory.activity_events(day=day)),
+            call(endpoint="https://next-call-1"),
+            call(endpoint="https://next-call-2"),
+        ]
+        mocked_get.assert_has_calls(expected_calls)
+def test_test_connection(power_bi_client):
+    with patch.object(power_bi_client._auth, "refresh_token") as mock_refresh:
+        power_bi_client.test_connection()
+        mock_refresh.assert_called_once()

castor-extractor 0.19.8__py3-none-any.whl → 0.20.4__py3-none-any.whl

Potentially problematic release.

castor-extractor 0.19.8__py3-none-any.whl → 0.20.4__py3-none-any.whl