PyPI - castor-extractor - Versions diffs - 0.20.0__py3-none-any.whl → 0.20.5__py3-none-any.whl - Mend

castor-extractor 0.20.0__py3-none-any.whl → 0.20.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of castor-extractor might be problematic. Click here for more details.

Files changed (43) hide show

CHANGELOG.md CHANGED Viewed

@@ -1,6 +1,26 @@
 # Changelog
+## 0.20.5 - 2024-10-09
+* Redshift: enable extraction from a Redshift Serverless instance
+## 0.20.4 - 2024-10-09
+* Salesforce warehouse: `Labels` instead of `api_names` for columns
+## 0.20.3 - 2024-10-03
+* Looker: no longer extract `as_html` dashboard elements
+## 0.20.2 - 2024-09-24
+* Thoughtspot: Adding connector
+## 0.20.1 - 2024-09-23
+* Power BI: Improved client based on APIClient
 ## 0.20.0 - 2024-09-23
 * Switch to Tableau revamped connector

castor_extractor/commands/extract_redshift.py CHANGED Viewed

@@ -23,6 +23,11 @@ def main():
         action="store_true",
         help="Skips files already extracted instead of replacing them",
     )
+    parser.add_argument(
+        "--serverless",
+        action="store_true",
+        help="Enables extraction for Redshift Serverless",
+    )
     parser.set_defaults(skip_existing=False)
     args = parser.parse_args()
@@ -34,5 +39,6 @@ def main():
         user=args.user,
         password=args.password,
         output_directory=args.output,
+        serverless=args.serverless,
         skip_existing=args.skip_existing,
     )

castor_extractor/commands/extract_thoughtspot.py ADDED Viewed

@@ -0,0 +1,18 @@
+import logging
+from argparse import ArgumentParser
+from castor_extractor.utils import parse_filled_arguments  # type: ignore
+from castor_extractor.visualization import thoughtspot  # type: ignore
+logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
+def main():
+    parser = ArgumentParser()
+    parser.add_argument("-b", "--base_url", help="base url")
+    parser.add_argument("-u", "--username", help="username")
+    parser.add_argument("-p", "--password", help="password")
+    parser.add_argument("-o", "--output", help="Directory to write to")
+    thoughtspot.extract_all(**parse_filled_arguments(parser))

castor_extractor/utils/client/api/client.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 from http import HTTPStatus
-from typing import Dict, Literal, Optional, Tuple
+from typing import Callable, Dict, Literal, Optional, Tuple
 import requests
 from requests import Response
@@ -137,12 +137,15 @@ class APIClient:
         endpoint: str,
         *,
         headers: Headers = None,
+        params: Optional[dict] = None,
         data: Optional[dict] = None,
         pagination_params: Optional[dict] = None,
+        handler: Optional[Callable] = None,
     ):
         response = self._call(
             method="POST",
             endpoint=endpoint,
+            params=params,
             data=data,
             pagination_params=pagination_params,
             headers=headers,
@@ -150,4 +153,6 @@ class APIClient:
         if response.status_code == HTTPStatus.UNAUTHORIZED:
             self._auth.refresh_token()
-        return handle_response(response, safe_mode=self._safe_mode)
+        return handle_response(
+            response, safe_mode=self._safe_mode, handler=handler
+        )

castor_extractor/utils/client/api/safe_request.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, List, Optional, Tuple, Union
+from typing import Any, Callable, List, Optional, Tuple, Union
 from requests import HTTPError, Response
@@ -41,9 +41,10 @@ class RequestSafeMode:
 def handle_response(
     response: Response,
     safe_mode: Optional[RequestSafeMode] = None,
+    handler: Optional[Callable] = None,
 ) -> Any:
     """
-    Util to handle a HTTP Response based on the response status code and the
+    Util to handle HTTP Response based on the response status code and the
     safe mode used
     """
     safe_mode = safe_mode if safe_mode else RequestSafeMode()
@@ -56,4 +57,6 @@ def handle_response(
         logger.error(f"Safe mode : skip request with error {e}")
         logger.debug(e, exc_info=True)
         return {}
-    return response.json()
+    if not handler:
+        return response.json()
+    return handler(response)

castor_extractor/visualization/looker/api/constants.py CHANGED Viewed

@@ -32,14 +32,10 @@ DASHBOARD_FILTERS = (
 DASHBOARD_ELEMENTS = (
     "id",
     "body_text",
-    "body_text_as_html",
     "note_text",
-    "note_text_as_html",
     "subtitle_text",
-    "subtitle_text_as_html",
     "title",
     "title_text",
-    "title_text_as_html",
     "title_hidden",
     "type",
     {

castor_extractor/visualization/powerbi/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 from .assets import PowerBiAsset
-from .client import Client, PowerbiCredentials, Urls
+from .client import DEFAULT_SCOPE, PowerbiClient, PowerbiCredentials
 from .extract import extract_all

castor_extractor/visualization/powerbi/assets.py CHANGED Viewed

@@ -11,16 +11,19 @@ class PowerBiAsset(ExternalAsset):
     DATASETS = "datasets"
     DATASET_FIELDS = "dataset_fields"
     METADATA = "metadata"
+    PAGES = "pages"
     REPORTS = "reports"
     TABLES = "tables"
+    TILES = "tiles"
     USERS = "users"
     @classproperty
     def optional(cls) -> Set["PowerBiAsset"]:
         return {
-            PowerBiAsset.DASHBOARDS,
             PowerBiAsset.DATASET_FIELDS,
+            PowerBiAsset.PAGES,
             PowerBiAsset.TABLES,
+            PowerBiAsset.TILES,
             PowerBiAsset.USERS,
         }
@@ -30,5 +33,8 @@ class PowerBiAsset(ExternalAsset):
 METADATA_ASSETS = (
     PowerBiAsset.DATASET_FIELDS,
     PowerBiAsset.TABLES,
+    PowerBiAsset.TILES,
     PowerBiAsset.USERS,
 )
+REPORTS_ASSETS = (PowerBiAsset.PAGES,)

castor_extractor/visualization/powerbi/client/__init__.py CHANGED Viewed

@@ -1,3 +1,2 @@
-from .constants import Urls
-from .credentials import PowerbiCredentials
-from .rest import Client
+from .client import PowerbiClient
+from .credentials import DEFAULT_SCOPE, PowerbiCredentials

castor_extractor/visualization/powerbi/client/authentication.py ADDED Viewed

@@ -0,0 +1,27 @@
+import msal  # type: ignore
+from ....utils import BearerAuth
+from .constants import Keys
+from .credentials import PowerbiCredentials
+from .endpoints import PowerBiEndpointFactory
+class PowerBiBearerAuth(BearerAuth):
+    def __init__(self, credentials: PowerbiCredentials):
+        self.credentials = credentials
+        authority = PowerBiEndpointFactory.authority(self.credentials.tenant_id)
+        self.app = msal.ConfidentialClientApplication(
+            client_id=self.credentials.client_id,
+            authority=authority,
+            client_credential=self.credentials.secret,
+        )
+    def fetch_token(self):
+        token = self.app.acquire_token_for_client(
+            scopes=self.credentials.scopes
+        )
+        if Keys.ACCESS_TOKEN not in token:
+            raise ValueError(f"No access token in token response: {token}")
+        return token[Keys.ACCESS_TOKEN]

castor_extractor/visualization/powerbi/client/client.py ADDED Viewed

@@ -0,0 +1,207 @@
+import logging
+from datetime import date
+from functools import partial
+from time import sleep
+from typing import Dict, Iterator, List, Optional, Union
+import requests
+from requests import HTTPError
+from ....utils import (
+    APIClient,
+    fetch_all_pages,
+)
+from ..assets import PowerBiAsset
+from .authentication import PowerBiBearerAuth
+from .constants import Keys
+from .credentials import PowerbiCredentials
+from .endpoints import PowerBiEndpointFactory
+from .pagination import PowerBiPagination
+POWERBI_DEFAULT_TIMEOUT_S = 30
+# The route we use to fetch workspaces info can retrieve a maximum of
+# 100 workspaces per call
+# More: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info#request-body
+METADATA_BATCH_SIZE = 100
+POWERBI_SCAN_STATUS_DONE = "Succeeded"
+POWERBI_SCAN_SLEEP_S = 1
+logger = logging.getLogger(__name__)
+class PowerbiClient(APIClient):
+    def __init__(
+        self,
+        credentials: PowerbiCredentials,
+    ):
+        auth = PowerBiBearerAuth(credentials=credentials)
+        super().__init__(
+            auth=auth,
+            timeout=POWERBI_DEFAULT_TIMEOUT_S,
+        )
+    def _activity_events(self, day: Optional[date] = None) -> Iterator[Dict]:
+        """
+        Returns a list of activity events for the organization.
+        https://learn.microsoft.com/en-us/power-bi/admin/service-admin-auditing#activityevents-rest-api
+        - when no day is specified, fallback is yesterday
+        """
+        request = partial(
+            self._get,
+            endpoint=PowerBiEndpointFactory.activity_events(day),
+        )
+        yield from fetch_all_pages(request, PowerBiPagination)
+    def _datasets(self) -> Iterator[Dict]:
+        """
+        Returns a list of datasets for the organization.
+        https://learn.microsoft.com/en-us/rest/api/power-bi/admin/datasets-get-datasets-as-admin
+        """
+        yield from self._get(PowerBiEndpointFactory.datasets())[Keys.VALUE]
+    def _dashboards(self) -> Iterator[Dict]:
+        """
+        Returns a list of dashboards for the organization.
+        https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dashboards-get-dashboards-as-admin
+        """
+        yield from self._get(PowerBiEndpointFactory.dashboards())[Keys.VALUE]
+    def _reports(self) -> Iterator[Dict]:
+        """
+        Returns a list of reports for the organization.
+        https://learn.microsoft.com/en-us/rest/api/power-bi/admin/reports-get-reports-as-admin
+        """
+        reports_endpoint = PowerBiEndpointFactory.reports()
+        reports = self._get(reports_endpoint)[Keys.VALUE]
+        for report in reports:
+            report_id = report.get(Keys.ID)
+            try:
+                pages_endpoint = PowerBiEndpointFactory.pages(report_id)
+                pages = self._get(pages_endpoint)[Keys.VALUE]
+                report["pages"] = pages
+            except (requests.HTTPError, requests.exceptions.Timeout) as e:
+                logger.debug(e)
+                continue
+        return reports
+    def _workspace_ids(self) -> List[str]:
+        """
+        Get workspaces ids from powerBI admin API.
+        more: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-get-modified-workspaces
+        """
+        params: Dict[str, Union[bool, str]] = {
+            Keys.INACTIVE_WORKSPACES: True,
+            Keys.PERSONAL_WORKSPACES: True,
+        }
+        response = self._get(
+            PowerBiEndpointFactory.workspace_ids(),
+            params=params,
+        )
+        return [x[Keys.ID] for x in response]
+    def _get_scan_result(self, scan_id: int) -> Iterator[Dict]:
+        endpoint = PowerBiEndpointFactory.metadata_scan_result(scan_id)
+        yield from self._get(endpoint)[Keys.WORKSPACES]
+    def _wait_for_scan_result(self, scan_id: int) -> bool:
+        """
+        Periodically checks the status of the metadata scan until the results
+        are ready.
+        """
+        endpoint = PowerBiEndpointFactory.metadata_scan_status(scan_id)
+        total_waiting_time_s = 0
+        while total_waiting_time_s < POWERBI_DEFAULT_TIMEOUT_S:
+            try:
+                result = self._get(endpoint)
+            except HTTPError as e:
+                logger.error(f"Scan {scan_id} failed. Error: {e}")
+                return False
+            if result[Keys.STATUS] == POWERBI_SCAN_STATUS_DONE:
+                logger.info(f"scan {scan_id} ready")
+                return True
+            total_waiting_time_s += POWERBI_SCAN_SLEEP_S
+            logger.info(
+                f"Waiting {POWERBI_SCAN_SLEEP_S} sec for scan {scan_id} to be ready…",
+            )
+            sleep(POWERBI_SCAN_SLEEP_S)
+        logger.warning(f"Scan {scan_id} timed out")
+        return False
+    def _create_scan(self, workspaces_ids: List[str]) -> int:
+        """
+        Tells the Power BI API to start an asynchronous metadata scan.
+        Returns the scan's ID.
+        https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info
+        """
+        params = {
+            "datasetExpressions": True,
+            "datasetSchema": True,
+            "datasourceDetails": True,
+            "getArtifactUsers": True,
+            "lineage": True,
+        }
+        request_body = {"workspaces": workspaces_ids}
+        scan_id = self._post(
+            PowerBiEndpointFactory.metadata_create_scan(),
+            params=params,
+            data=request_body,
+        )
+        return scan_id[Keys.ID]
+    def _metadata(self) -> Iterator[Dict]:
+        """
+        Fetch metadata by workspace. The metadata scanning is asynchronous and
+        requires the following steps:
+        - create the asynchronous scan
+        - periodically check the scan status to know when it's finished
+        - get the actual scan results
+        https://learn.microsoft.com/en-us/power-bi/enterprise/service-admin-metadata-scanning
+        """
+        ids = self._workspace_ids()
+        for index in range(0, len(ids), METADATA_BATCH_SIZE):
+            batch_ids = ids[index : index + METADATA_BATCH_SIZE]
+            scan_id = self._create_scan(batch_ids)
+            self._wait_for_scan_result(scan_id)
+            yield from self._get_scan_result(scan_id)
+    def test_connection(self) -> None:
+        """Use credentials & verify requesting the API doesn't raise an error"""
+        self._auth.refresh_token()
+    def fetch(
+        self,
+        asset: PowerBiAsset,
+        *,
+        day: Optional[date] = None,
+    ) -> Iterator[Dict]:
+        """
+        Given a PowerBi asset, returns the corresponding data using the
+        appropriate client.
+        """
+        if asset == PowerBiAsset.ACTIVITY_EVENTS:
+            yield from self._activity_events(day=day)
+        elif asset == PowerBiAsset.DATASETS:
+            yield from self._datasets()
+        elif asset == PowerBiAsset.DASHBOARDS:
+            yield from self._dashboards()
+        elif asset == PowerBiAsset.REPORTS:
+            yield from self._reports()
+        elif asset == PowerBiAsset.METADATA:
+            yield from self._metadata()
+        else:
+            raise ValueError(f"This asset {asset} is unknown")

castor_extractor/visualization/powerbi/client/client_test.py ADDED Viewed

@@ -0,0 +1,173 @@
+from datetime import date
+from unittest.mock import Mock, call, patch
+import pytest
+from .authentication import msal
+from .client import PowerbiClient
+from .constants import Keys
+from .credentials import PowerbiCredentials
+from .endpoints import PowerBiEndpointFactory
+FAKE_TENANT_ID = "IamFake"
+FAKE_CLIENT_ID = "MeTwo"
+FAKE_SECRET = "MeThree"
+@pytest.fixture
+def mock_msal():
+    with patch.object(msal, "ConfidentialClientApplication") as mock_app:
+        mock_app.return_value.acquire_token_for_client.return_value = {
+            "access_token": "fake_token"
+        }
+        yield mock_app
+@pytest.fixture
+def power_bi_client(mock_msal):
+    creds = PowerbiCredentials(
+        tenant_id=FAKE_TENANT_ID,
+        client_id=FAKE_CLIENT_ID,
+        secret=FAKE_SECRET,
+    )
+    return PowerbiClient(creds)
+def test__access_token(power_bi_client, mock_msal):
+    # Valid token scenario
+    valid_token = "mock_token"
+    mock_response = {"access_token": valid_token}
+    returning_valid_token = Mock(return_value=mock_response)
+    mock_msal.return_value.acquire_token_for_client = returning_valid_token
+    assert power_bi_client._auth.fetch_token() == valid_token
+    # Invalid token scenario
+    invalid_response = {"not_access_token": "666"}
+    returning_invalid_token = Mock(return_value=invalid_response)
+    mock_msal.return_value.acquire_token_for_client = returning_invalid_token
+    with pytest.raises(ValueError):
+        power_bi_client._auth.fetch_token()
+def test__datasets(power_bi_client):
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.return_value = {"value": [{"id": 1, "type": "dataset"}]}
+        datasets = list(power_bi_client._datasets())
+        mocked_get.assert_called_with(PowerBiEndpointFactory.datasets())
+        assert datasets == [{"id": 1, "type": "dataset"}]
+def test__dashboards(power_bi_client):
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.return_value = {"value": [{"id": 1, "type": "dashboard"}]}
+        dashboards = list(power_bi_client._dashboards())
+        mocked_get.assert_called_with(PowerBiEndpointFactory.dashboards())
+        assert dashboards == [{"id": 1, "type": "dashboard"}]
+def test__reports(power_bi_client):
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.side_effect = [
+            {"value": [{"id": 1, "type": "report"}]},
+            {
+                "value": [
+                    {"name": "page_name", "displayName": "page", "order": 0}
+                ]
+            },
+        ]
+        reports = list(power_bi_client._reports())
+        calls = [
+            call(PowerBiEndpointFactory.reports()),
+            call(PowerBiEndpointFactory.pages("1")),
+        ]
+        mocked_get.assert_has_calls(calls)
+        assert reports == [
+            {
+                "id": 1,
+                "type": "report",
+                "pages": [
+                    {"name": "page_name", "displayName": "page", "order": 0}
+                ],
+            }
+        ]
+def test__workspace_ids(power_bi_client):
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.return_value = [{"id": 1000}, {"id": 1001}, {"id": 1003}]
+        ids = power_bi_client._workspace_ids()
+        assert ids == [1000, 1001, 1003]
+        params = {
+            Keys.INACTIVE_WORKSPACES: True,
+            Keys.PERSONAL_WORKSPACES: True,
+        }
+        mocked_get.assert_called_with(
+            PowerBiEndpointFactory.workspace_ids(),
+            params=params,
+        )
+@patch.object(PowerbiClient, "_get_scan_result")
+@patch.object(PowerbiClient, "_wait_for_scan_result")
+@patch.object(PowerbiClient, "_create_scan")
+@patch.object(PowerbiClient, "_workspace_ids")
+def test__metadata(
+    mock_workspace_ids,
+    mock_create_scan,
+    mock_wait_for_scan,
+    mock_get_scan_result,
+    power_bi_client,
+):
+    mock_workspace_ids.return_value = list(range(200))
+    mock_create_scan.return_value = 314
+    mock_wait_for_scan.return_value = True
+    mock_get_scan_result.return_value = [{"workspace_id": 1871}]
+    result = list(power_bi_client._metadata())
+    assert result == [{"workspace_id": 1871}, {"workspace_id": 1871}]
+def test__activity_events(power_bi_client):
+    day = date.today()
+    mocked_get_results = [
+        {
+            Keys.ACTIVITY_EVENT_ENTITIES: ["foo", "bar"],
+            Keys.LAST_RESULT_SET: False,
+            Keys.CONTINUATION_URI: "https://next-call-1",
+        },
+        {
+            Keys.ACTIVITY_EVENT_ENTITIES: ["baz"],
+            Keys.LAST_RESULT_SET: False,
+            Keys.CONTINUATION_URI: "https://next-call-2",
+        },
+        {
+            Keys.ACTIVITY_EVENT_ENTITIES: ["biz"],
+            Keys.LAST_RESULT_SET: True,
+            Keys.CONTINUATION_URI: None,
+        },
+    ]
+    with patch.object(power_bi_client, "_get") as mocked_get:
+        mocked_get.side_effect = mocked_get_results
+        result = list(power_bi_client._activity_events(day=day))
+        assert result == ["foo", "bar", "baz", "biz"]
+        expected_calls = [
+            call(endpoint=PowerBiEndpointFactory.activity_events(day=day)),
+            call(endpoint="https://next-call-1"),
+            call(endpoint="https://next-call-2"),
+        ]
+        mocked_get.assert_has_calls(expected_calls)
+def test_test_connection(power_bi_client):
+    with patch.object(power_bi_client._auth, "refresh_token") as mock_refresh:
+        power_bi_client.test_connection()
+        mock_refresh.assert_called_once()

castor_extractor/visualization/powerbi/client/constants.py CHANGED Viewed

@@ -1,62 +1,3 @@
-"""
-File regrouping all constants used in PowerBi client
-"""
-DEFAULT_TIMEOUT_IN_SECS = 30
-SCAN_READY = "Succeeded"
-# ModifiedSince params should not be older than 30 days
-RECENT_DAYS = 30
-GET = "GET"
-POST = "POST"
-class Urls:
-    """PowerBi's urls"""
-    CLIENT_APP_BASE = "https://login.microsoftonline.com/"
-    DEFAULT_SCOPE = "https://analysis.windows.net/powerbi/api/.default"
-    REST_API_BASE_PATH = "https://api.powerbi.com/v1.0/myorg"
-    # PBI rest API Routes
-    ACTIVITY_EVENTS = f"{REST_API_BASE_PATH}/admin/activityevents"
-    DASHBOARD = f"{REST_API_BASE_PATH}/admin/dashboards"
-    DATASETS = f"{REST_API_BASE_PATH}/admin/datasets"
-    GROUPS = f"{REST_API_BASE_PATH}/admin/groups"
-    METADATA_GET = f"{REST_API_BASE_PATH}/admin/workspaces/scanResult"
-    METADATA_POST = f"{REST_API_BASE_PATH}/admin/workspaces/getInfo"
-    METADATA_WAIT = f"{REST_API_BASE_PATH}/admin/workspaces/scanStatus"
-    REPORTS = f"{REST_API_BASE_PATH}/admin/reports"
-    WORKSPACE_IDS = (
-        "https://api.powerbi.com/v1.0/myorg/admin/workspaces/modified"
-    )
-class Batches:
-    """Batches used within PowerBI api calls"""
-    DEFAULT = 100
-    # The route we use to fetch workspaces info can retrieve a maximum of
-    # 100 workspaces per call
-    # More: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info#request-body
-    METADATA = 100
-class QueryParams:
-    """
-    Frequently used PowerBi query params
-    """
-    METADATA_SCAN = {
-        "datasetExpressions": True,
-        "datasetSchema": True,
-        "datasourceDetails": True,
-        "getArtifactUsers": True,
-        "lineage": True,
-    }
-    ACTIVE_WORKSPACE_FILTER = "state eq 'Active' and type eq 'Workspace'"
 class Keys:
     ACCESS_TOKEN = "access_token"  # noqa: S105
     ACTIVITY_EVENT_ENTITIES = "activityEventEntities"
@@ -64,15 +5,7 @@ class Keys:
     ID = "id"
     INACTIVE_WORKSPACES = "excludeInActiveWorkspaces"
     LAST_RESULT_SET = "lastResultSet"
-    MODIFIED_SINCE = "modifiedSince"
     PERSONAL_WORKSPACES = "excludePersonalWorkspaces"
     STATUS = "status"
     VALUE = "value"
     WORKSPACES = "workspaces"
-class Assertions:
-    """Assertion's messages"""
-    BATCH_TOO_BIG = f"Can not retrieve more than {Batches.METADATA} at the time"
-    DATETIME_TOO_OLD = "Date must be within 30 days range"

castor-extractor 0.20.0__py3-none-any.whl → 0.20.5__py3-none-any.whl

Potentially problematic release.

castor-extractor 0.20.0__py3-none-any.whl → 0.20.5__py3-none-any.whl