castor-extractor 0.19.8__py3-none-any.whl → 0.20.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +24 -0
- castor_extractor/commands/extract_tableau.py +8 -22
- castor_extractor/commands/extract_thoughtspot.py +18 -0
- castor_extractor/utils/__init__.py +1 -1
- castor_extractor/utils/client/api/client.py +14 -9
- castor_extractor/utils/client/api/pagination.py +2 -2
- castor_extractor/utils/client/api/safe_request.py +6 -3
- castor_extractor/utils/write.py +1 -1
- castor_extractor/visualization/looker/api/constants.py +0 -4
- castor_extractor/visualization/powerbi/__init__.py +1 -1
- castor_extractor/visualization/powerbi/assets.py +7 -1
- castor_extractor/visualization/powerbi/client/__init__.py +2 -3
- castor_extractor/visualization/powerbi/client/authentication.py +27 -0
- castor_extractor/visualization/powerbi/client/client.py +207 -0
- castor_extractor/visualization/powerbi/client/client_test.py +173 -0
- castor_extractor/visualization/powerbi/client/constants.py +0 -67
- castor_extractor/visualization/powerbi/client/credentials.py +3 -4
- castor_extractor/visualization/powerbi/client/credentials_test.py +3 -4
- castor_extractor/visualization/powerbi/client/endpoints.py +65 -0
- castor_extractor/visualization/powerbi/client/pagination.py +32 -0
- castor_extractor/visualization/powerbi/extract.py +14 -9
- castor_extractor/visualization/tableau_revamp/extract.py +10 -7
- castor_extractor/visualization/thoughtspot/__init__.py +3 -0
- castor_extractor/visualization/thoughtspot/assets.py +9 -0
- castor_extractor/visualization/thoughtspot/client/__init__.py +2 -0
- castor_extractor/visualization/thoughtspot/client/client.py +120 -0
- castor_extractor/visualization/thoughtspot/client/credentials.py +18 -0
- castor_extractor/visualization/thoughtspot/client/endpoints.py +12 -0
- castor_extractor/visualization/thoughtspot/client/utils.py +25 -0
- castor_extractor/visualization/thoughtspot/client/utils_test.py +57 -0
- castor_extractor/visualization/thoughtspot/extract.py +49 -0
- castor_extractor/warehouse/databricks/api_client.py +6 -0
- castor_extractor/warehouse/databricks/client.py +11 -5
- castor_extractor/warehouse/salesforce/client.py +1 -1
- castor_extractor/warehouse/salesforce/format.py +40 -30
- castor_extractor/warehouse/salesforce/format_test.py +61 -24
- castor_extractor/warehouse/sqlserver/client.py +2 -2
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/METADATA +25 -1
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/RECORD +42 -31
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/entry_points.txt +1 -0
- castor_extractor/visualization/powerbi/client/rest.py +0 -305
- castor_extractor/visualization/powerbi/client/rest_test.py +0 -290
- castor_extractor/visualization/powerbi/client/utils.py +0 -19
- castor_extractor/visualization/powerbi/client/utils_test.py +0 -24
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/LICENCE +0 -0
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/WHEEL +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,6 +1,30 @@
|
|
|
1
1
|
|
|
2
2
|
# Changelog
|
|
3
3
|
|
|
4
|
+
## 0.20.4 - 2024-10-09
|
|
5
|
+
|
|
6
|
+
* Salesforce warehouse: `Labels` instead of `api_names` for columns
|
|
7
|
+
|
|
8
|
+
## 0.20.3 - 2024-10-03
|
|
9
|
+
|
|
10
|
+
* Looker: no longer extract `as_html` dashboard elements
|
|
11
|
+
|
|
12
|
+
## 0.20.2 - 2024-09-24
|
|
13
|
+
|
|
14
|
+
* Thoughtspot: Adding connector
|
|
15
|
+
|
|
16
|
+
## 0.20.1 - 2024-09-23
|
|
17
|
+
|
|
18
|
+
* Power BI: Improved client based on APIClient
|
|
19
|
+
|
|
20
|
+
## 0.20.0 - 2024-09-23
|
|
21
|
+
|
|
22
|
+
* Switch to Tableau revamped connector
|
|
23
|
+
|
|
24
|
+
## 0.19.9 - 2024-09-19
|
|
25
|
+
|
|
26
|
+
* Databricks: multithreading to retrieve column lineage
|
|
27
|
+
|
|
4
28
|
## 0.19.8 - 2024-09-18
|
|
5
29
|
|
|
6
30
|
* Metabase: Handle duplicate dashboards
|
castor_extractor/commands/extract_tableau.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from argparse import ArgumentParser
|
|
3
3
|
|
|
4
|
-
from castor_extractor.
|
|
4
|
+
from castor_extractor.utils import parse_filled_arguments # type: ignore
|
|
5
|
+
from castor_extractor.visualization import tableau_revamp # type: ignore
|
|
5
6
|
|
|
6
7
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
|
|
7
8
|
|
|
@@ -19,29 +20,14 @@ def main():
|
|
|
19
20
|
|
|
20
21
|
parser.add_argument("-b", "--server-url", help="Tableau server url")
|
|
21
22
|
parser.add_argument("-i", "--site-id", help="Tableau site ID")
|
|
23
|
+
|
|
22
24
|
parser.add_argument(
|
|
23
|
-
"-
|
|
24
|
-
"
|
|
25
|
-
help="Tableau safe mode",
|
|
25
|
+
"--with-pulse",
|
|
26
|
+
dest="with_pulse",
|
|
26
27
|
action="store_true",
|
|
28
|
+
help="Extract Tableau Pulse assets: Metrics and Subscriptions",
|
|
27
29
|
)
|
|
28
|
-
parser.add_argument("-o", "--output", help="Directory to write to")
|
|
29
30
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
client = tableau.ApiClient(
|
|
33
|
-
user=args.user,
|
|
34
|
-
password=args.password,
|
|
35
|
-
token_name=args.token_name,
|
|
36
|
-
token=args.token,
|
|
37
|
-
server_url=args.server_url,
|
|
38
|
-
site_id=args.site_id,
|
|
39
|
-
safe_mode=args.safe_mode,
|
|
40
|
-
)
|
|
41
|
-
|
|
42
|
-
client.login()
|
|
31
|
+
parser.add_argument("-o", "--output", help="Directory to write to")
|
|
43
32
|
|
|
44
|
-
|
|
45
|
-
client,
|
|
46
|
-
output_directory=args.output,
|
|
47
|
-
)
|
|
33
|
+
tableau_revamp.extract_all(**parse_filled_arguments(parser))
|
castor_extractor/commands/extract_thoughtspot.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from argparse import ArgumentParser
|
|
3
|
+
|
|
4
|
+
from castor_extractor.utils import parse_filled_arguments # type: ignore
|
|
5
|
+
from castor_extractor.visualization import thoughtspot # type: ignore
|
|
6
|
+
|
|
7
|
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
|
|
11
|
+
parser = ArgumentParser()
|
|
12
|
+
|
|
13
|
+
parser.add_argument("-b", "--base_url", help="base url")
|
|
14
|
+
parser.add_argument("-u", "--username", help="username")
|
|
15
|
+
parser.add_argument("-p", "--password", help="password")
|
|
16
|
+
parser.add_argument("-o", "--output", help="Directory to write to")
|
|
17
|
+
|
|
18
|
+
thoughtspot.extract_all(**parse_filled_arguments(parser))
|
castor_extractor/utils/__init__.py
CHANGED
|
@@ -34,7 +34,7 @@ from .pager import (
|
|
|
34
34
|
PagerOnIdLogger,
|
|
35
35
|
PagerStopStrategy,
|
|
36
36
|
)
|
|
37
|
-
from .retry import RetryStrategy, retry
|
|
37
|
+
from .retry import RetryStrategy, retry, retry_request
|
|
38
38
|
from .safe import SafeMode, safe_mode
|
|
39
39
|
from .store import AbstractStorage, LocalStorage
|
|
40
40
|
from .string import decode_when_bytes, string_to_tuple
|
castor_extractor/utils/client/api/client.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from http import HTTPStatus
|
|
3
|
-
from typing import Dict, Literal, Optional, Tuple
|
|
3
|
+
from typing import Callable, Dict, Literal, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
import requests
|
|
6
6
|
from requests import Response
|
|
@@ -26,18 +26,18 @@ def _generate_payloads(
|
|
|
26
26
|
params: Optional[dict],
|
|
27
27
|
data: Optional[dict],
|
|
28
28
|
pagination_params: Optional[dict],
|
|
29
|
-
) -> Tuple[dict, dict]:
|
|
29
|
+
) -> Tuple[Optional[dict], Optional[dict]]:
|
|
30
30
|
_pagination_params = pagination_params or {}
|
|
31
|
-
params = params or {}
|
|
32
|
-
data = data or {}
|
|
33
31
|
|
|
34
32
|
if method == "GET":
|
|
33
|
+
params = params or {}
|
|
35
34
|
params = {**params, **_pagination_params}
|
|
36
|
-
|
|
35
|
+
return data, params
|
|
36
|
+
if method == "POST":
|
|
37
|
+
data = data or {}
|
|
37
38
|
data = {**data, **_pagination_params}
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
return data, params
|
|
39
|
+
return data, params
|
|
40
|
+
raise ValueError(f"Method {method} is not yet supported")
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
class APIClient:
|
|
@@ -137,12 +137,15 @@ class APIClient:
|
|
|
137
137
|
endpoint: str,
|
|
138
138
|
*,
|
|
139
139
|
headers: Headers = None,
|
|
140
|
+
params: Optional[dict] = None,
|
|
140
141
|
data: Optional[dict] = None,
|
|
141
142
|
pagination_params: Optional[dict] = None,
|
|
143
|
+
handler: Optional[Callable] = None,
|
|
142
144
|
):
|
|
143
145
|
response = self._call(
|
|
144
146
|
method="POST",
|
|
145
147
|
endpoint=endpoint,
|
|
148
|
+
params=params,
|
|
146
149
|
data=data,
|
|
147
150
|
pagination_params=pagination_params,
|
|
148
151
|
headers=headers,
|
|
@@ -150,4 +153,6 @@ class APIClient:
|
|
|
150
153
|
if response.status_code == HTTPStatus.UNAUTHORIZED:
|
|
151
154
|
self._auth.refresh_token()
|
|
152
155
|
|
|
153
|
-
return handle_response(
|
|
156
|
+
return handle_response(
|
|
157
|
+
response, safe_mode=self._safe_mode, handler=handler
|
|
158
|
+
)
|
castor_extractor/utils/client/api/pagination.py
CHANGED
|
@@ -67,7 +67,7 @@ def fetch_all_pages(
|
|
|
67
67
|
response_payload = request()
|
|
68
68
|
paginated_response = pagination_model(**response_payload)
|
|
69
69
|
while not paginated_response.is_last():
|
|
70
|
-
logger.
|
|
70
|
+
logger.debug(f"Fetching page number {page_number}")
|
|
71
71
|
yield from paginated_response.page_results()
|
|
72
72
|
next_page_parameters = paginated_response.next_page_parameters()
|
|
73
73
|
new_request = partial(request, **next_page_parameters)
|
|
@@ -79,5 +79,5 @@ def fetch_all_pages(
|
|
|
79
79
|
page_number += 1
|
|
80
80
|
|
|
81
81
|
# send last page's results
|
|
82
|
-
logger.
|
|
82
|
+
logger.debug(f"Fetching page number {page_number}")
|
|
83
83
|
yield from paginated_response.page_results()
|
castor_extractor/utils/client/api/safe_request.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any, List, Optional, Tuple, Union
|
|
2
|
+
from typing import Any, Callable, List, Optional, Tuple, Union
|
|
3
3
|
|
|
4
4
|
from requests import HTTPError, Response
|
|
5
5
|
|
|
@@ -41,9 +41,10 @@ class RequestSafeMode:
|
|
|
41
41
|
def handle_response(
|
|
42
42
|
response: Response,
|
|
43
43
|
safe_mode: Optional[RequestSafeMode] = None,
|
|
44
|
+
handler: Optional[Callable] = None,
|
|
44
45
|
) -> Any:
|
|
45
46
|
"""
|
|
46
|
-
Util to handle
|
|
47
|
+
Util to handle HTTP Response based on the response status code and the
|
|
47
48
|
safe mode used
|
|
48
49
|
"""
|
|
49
50
|
safe_mode = safe_mode if safe_mode else RequestSafeMode()
|
|
@@ -56,4 +57,6 @@ def handle_response(
|
|
|
56
57
|
logger.error(f"Safe mode : skip request with error {e}")
|
|
57
58
|
logger.debug(e, exc_info=True)
|
|
58
59
|
return {}
|
|
59
|
-
|
|
60
|
+
if not handler:
|
|
61
|
+
return response.json()
|
|
62
|
+
return handler(response)
|
castor_extractor/utils/write.py
CHANGED
|
@@ -35,7 +35,7 @@ def write_json(filename: str, data: Any):
|
|
|
35
35
|
"""
|
|
36
36
|
with open(filename, "w", encoding=ENCODING) as f:
|
|
37
37
|
json.dump(data, f)
|
|
38
|
-
logger.info(f"Wrote output file: {filename}")
|
|
38
|
+
logger.info(f"Wrote output file: {filename} ({f.tell()} bytes)")
|
|
39
39
|
|
|
40
40
|
|
|
41
41
|
def _current_version() -> str:
|
castor_extractor/visualization/looker/api/constants.py
CHANGED
|
@@ -32,14 +32,10 @@ DASHBOARD_FILTERS = (
|
|
|
32
32
|
DASHBOARD_ELEMENTS = (
|
|
33
33
|
"id",
|
|
34
34
|
"body_text",
|
|
35
|
-
"body_text_as_html",
|
|
36
35
|
"note_text",
|
|
37
|
-
"note_text_as_html",
|
|
38
36
|
"subtitle_text",
|
|
39
|
-
"subtitle_text_as_html",
|
|
40
37
|
"title",
|
|
41
38
|
"title_text",
|
|
42
|
-
"title_text_as_html",
|
|
43
39
|
"title_hidden",
|
|
44
40
|
"type",
|
|
45
41
|
{
|
castor_extractor/visualization/powerbi/assets.py
CHANGED
|
@@ -11,16 +11,19 @@ class PowerBiAsset(ExternalAsset):
|
|
|
11
11
|
DATASETS = "datasets"
|
|
12
12
|
DATASET_FIELDS = "dataset_fields"
|
|
13
13
|
METADATA = "metadata"
|
|
14
|
+
PAGES = "pages"
|
|
14
15
|
REPORTS = "reports"
|
|
15
16
|
TABLES = "tables"
|
|
17
|
+
TILES = "tiles"
|
|
16
18
|
USERS = "users"
|
|
17
19
|
|
|
18
20
|
@classproperty
|
|
19
21
|
def optional(cls) -> Set["PowerBiAsset"]:
|
|
20
22
|
return {
|
|
21
|
-
PowerBiAsset.DASHBOARDS,
|
|
22
23
|
PowerBiAsset.DATASET_FIELDS,
|
|
24
|
+
PowerBiAsset.PAGES,
|
|
23
25
|
PowerBiAsset.TABLES,
|
|
26
|
+
PowerBiAsset.TILES,
|
|
24
27
|
PowerBiAsset.USERS,
|
|
25
28
|
}
|
|
26
29
|
|
|
@@ -30,5 +33,8 @@ class PowerBiAsset(ExternalAsset):
|
|
|
30
33
|
METADATA_ASSETS = (
|
|
31
34
|
PowerBiAsset.DATASET_FIELDS,
|
|
32
35
|
PowerBiAsset.TABLES,
|
|
36
|
+
PowerBiAsset.TILES,
|
|
33
37
|
PowerBiAsset.USERS,
|
|
34
38
|
)
|
|
39
|
+
|
|
40
|
+
REPORTS_ASSETS = (PowerBiAsset.PAGES,)
|
castor_extractor/visualization/powerbi/client/__init__.py
CHANGED
|
@@ -1,3 +1,2 @@
|
|
|
1
|
-
from .
|
|
2
|
-
from .credentials import PowerbiCredentials
|
|
3
|
-
from .rest import Client
|
|
1
|
+
from .client import PowerbiClient
|
|
2
|
+
from .credentials import DEFAULT_SCOPE, PowerbiCredentials
|
castor_extractor/visualization/powerbi/client/authentication.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import msal # type: ignore
|
|
2
|
+
|
|
3
|
+
from ....utils import BearerAuth
|
|
4
|
+
from .constants import Keys
|
|
5
|
+
from .credentials import PowerbiCredentials
|
|
6
|
+
from .endpoints import PowerBiEndpointFactory
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PowerBiBearerAuth(BearerAuth):
|
|
10
|
+
def __init__(self, credentials: PowerbiCredentials):
|
|
11
|
+
self.credentials = credentials
|
|
12
|
+
authority = PowerBiEndpointFactory.authority(self.credentials.tenant_id)
|
|
13
|
+
self.app = msal.ConfidentialClientApplication(
|
|
14
|
+
client_id=self.credentials.client_id,
|
|
15
|
+
authority=authority,
|
|
16
|
+
client_credential=self.credentials.secret,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
def fetch_token(self):
|
|
20
|
+
token = self.app.acquire_token_for_client(
|
|
21
|
+
scopes=self.credentials.scopes
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
if Keys.ACCESS_TOKEN not in token:
|
|
25
|
+
raise ValueError(f"No access token in token response: {token}")
|
|
26
|
+
|
|
27
|
+
return token[Keys.ACCESS_TOKEN]
|
castor_extractor/visualization/powerbi/client/client.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import date
|
|
3
|
+
from functools import partial
|
|
4
|
+
from time import sleep
|
|
5
|
+
from typing import Dict, Iterator, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
from requests import HTTPError
|
|
9
|
+
|
|
10
|
+
from ....utils import (
|
|
11
|
+
APIClient,
|
|
12
|
+
fetch_all_pages,
|
|
13
|
+
)
|
|
14
|
+
from ..assets import PowerBiAsset
|
|
15
|
+
from .authentication import PowerBiBearerAuth
|
|
16
|
+
from .constants import Keys
|
|
17
|
+
from .credentials import PowerbiCredentials
|
|
18
|
+
from .endpoints import PowerBiEndpointFactory
|
|
19
|
+
from .pagination import PowerBiPagination
|
|
20
|
+
|
|
21
|
+
POWERBI_DEFAULT_TIMEOUT_S = 30
|
|
22
|
+
# The route we use to fetch workspaces info can retrieve a maximum of
|
|
23
|
+
# 100 workspaces per call
|
|
24
|
+
# More: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info#request-body
|
|
25
|
+
METADATA_BATCH_SIZE = 100
|
|
26
|
+
POWERBI_SCAN_STATUS_DONE = "Succeeded"
|
|
27
|
+
POWERBI_SCAN_SLEEP_S = 1
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class PowerbiClient(APIClient):
|
|
33
|
+
def __init__(
|
|
34
|
+
self,
|
|
35
|
+
credentials: PowerbiCredentials,
|
|
36
|
+
):
|
|
37
|
+
auth = PowerBiBearerAuth(credentials=credentials)
|
|
38
|
+
super().__init__(
|
|
39
|
+
auth=auth,
|
|
40
|
+
timeout=POWERBI_DEFAULT_TIMEOUT_S,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
def _activity_events(self, day: Optional[date] = None) -> Iterator[Dict]:
|
|
44
|
+
"""
|
|
45
|
+
Returns a list of activity events for the organization.
|
|
46
|
+
https://learn.microsoft.com/en-us/power-bi/admin/service-admin-auditing#activityevents-rest-api
|
|
47
|
+
- when no day is specified, fallback is yesterday
|
|
48
|
+
"""
|
|
49
|
+
request = partial(
|
|
50
|
+
self._get,
|
|
51
|
+
endpoint=PowerBiEndpointFactory.activity_events(day),
|
|
52
|
+
)
|
|
53
|
+
yield from fetch_all_pages(request, PowerBiPagination)
|
|
54
|
+
|
|
55
|
+
def _datasets(self) -> Iterator[Dict]:
|
|
56
|
+
"""
|
|
57
|
+
Returns a list of datasets for the organization.
|
|
58
|
+
https://learn.microsoft.com/en-us/rest/api/power-bi/admin/datasets-get-datasets-as-admin
|
|
59
|
+
"""
|
|
60
|
+
yield from self._get(PowerBiEndpointFactory.datasets())[Keys.VALUE]
|
|
61
|
+
|
|
62
|
+
def _dashboards(self) -> Iterator[Dict]:
|
|
63
|
+
"""
|
|
64
|
+
Returns a list of dashboards for the organization.
|
|
65
|
+
https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dashboards-get-dashboards-as-admin
|
|
66
|
+
"""
|
|
67
|
+
yield from self._get(PowerBiEndpointFactory.dashboards())[Keys.VALUE]
|
|
68
|
+
|
|
69
|
+
def _reports(self) -> Iterator[Dict]:
|
|
70
|
+
"""
|
|
71
|
+
Returns a list of reports for the organization.
|
|
72
|
+
https://learn.microsoft.com/en-us/rest/api/power-bi/admin/reports-get-reports-as-admin
|
|
73
|
+
"""
|
|
74
|
+
reports_endpoint = PowerBiEndpointFactory.reports()
|
|
75
|
+
reports = self._get(reports_endpoint)[Keys.VALUE]
|
|
76
|
+
|
|
77
|
+
for report in reports:
|
|
78
|
+
report_id = report.get(Keys.ID)
|
|
79
|
+
|
|
80
|
+
try:
|
|
81
|
+
pages_endpoint = PowerBiEndpointFactory.pages(report_id)
|
|
82
|
+
pages = self._get(pages_endpoint)[Keys.VALUE]
|
|
83
|
+
report["pages"] = pages
|
|
84
|
+
except (requests.HTTPError, requests.exceptions.Timeout) as e:
|
|
85
|
+
logger.debug(e)
|
|
86
|
+
continue
|
|
87
|
+
|
|
88
|
+
return reports
|
|
89
|
+
|
|
90
|
+
def _workspace_ids(self) -> List[str]:
|
|
91
|
+
"""
|
|
92
|
+
Get workspaces ids from powerBI admin API.
|
|
93
|
+
more: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-get-modified-workspaces
|
|
94
|
+
"""
|
|
95
|
+
params: Dict[str, Union[bool, str]] = {
|
|
96
|
+
Keys.INACTIVE_WORKSPACES: True,
|
|
97
|
+
Keys.PERSONAL_WORKSPACES: True,
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
response = self._get(
|
|
101
|
+
PowerBiEndpointFactory.workspace_ids(),
|
|
102
|
+
params=params,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
return [x[Keys.ID] for x in response]
|
|
106
|
+
|
|
107
|
+
def _get_scan_result(self, scan_id: int) -> Iterator[Dict]:
|
|
108
|
+
endpoint = PowerBiEndpointFactory.metadata_scan_result(scan_id)
|
|
109
|
+
yield from self._get(endpoint)[Keys.WORKSPACES]
|
|
110
|
+
|
|
111
|
+
def _wait_for_scan_result(self, scan_id: int) -> bool:
|
|
112
|
+
"""
|
|
113
|
+
Periodically checks the status of the metadata scan until the results
|
|
114
|
+
are ready.
|
|
115
|
+
"""
|
|
116
|
+
endpoint = PowerBiEndpointFactory.metadata_scan_status(scan_id)
|
|
117
|
+
total_waiting_time_s = 0
|
|
118
|
+
|
|
119
|
+
while total_waiting_time_s < POWERBI_DEFAULT_TIMEOUT_S:
|
|
120
|
+
try:
|
|
121
|
+
result = self._get(endpoint)
|
|
122
|
+
except HTTPError as e:
|
|
123
|
+
logger.error(f"Scan {scan_id} failed. Error: {e}")
|
|
124
|
+
return False
|
|
125
|
+
|
|
126
|
+
if result[Keys.STATUS] == POWERBI_SCAN_STATUS_DONE:
|
|
127
|
+
logger.info(f"scan {scan_id} ready")
|
|
128
|
+
return True
|
|
129
|
+
|
|
130
|
+
total_waiting_time_s += POWERBI_SCAN_SLEEP_S
|
|
131
|
+
logger.info(
|
|
132
|
+
f"Waiting {POWERBI_SCAN_SLEEP_S} sec for scan {scan_id} to be ready…",
|
|
133
|
+
)
|
|
134
|
+
sleep(POWERBI_SCAN_SLEEP_S)
|
|
135
|
+
|
|
136
|
+
logger.warning(f"Scan {scan_id} timed out")
|
|
137
|
+
return False
|
|
138
|
+
|
|
139
|
+
def _create_scan(self, workspaces_ids: List[str]) -> int:
|
|
140
|
+
"""
|
|
141
|
+
Tells the Power BI API to start an asynchronous metadata scan.
|
|
142
|
+
Returns the scan's ID.
|
|
143
|
+
https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info
|
|
144
|
+
"""
|
|
145
|
+
params = {
|
|
146
|
+
"datasetExpressions": True,
|
|
147
|
+
"datasetSchema": True,
|
|
148
|
+
"datasourceDetails": True,
|
|
149
|
+
"getArtifactUsers": True,
|
|
150
|
+
"lineage": True,
|
|
151
|
+
}
|
|
152
|
+
request_body = {"workspaces": workspaces_ids}
|
|
153
|
+
scan_id = self._post(
|
|
154
|
+
PowerBiEndpointFactory.metadata_create_scan(),
|
|
155
|
+
params=params,
|
|
156
|
+
data=request_body,
|
|
157
|
+
)
|
|
158
|
+
return scan_id[Keys.ID]
|
|
159
|
+
|
|
160
|
+
def _metadata(self) -> Iterator[Dict]:
|
|
161
|
+
"""
|
|
162
|
+
Fetch metadata by workspace. The metadata scanning is asynchronous and
|
|
163
|
+
requires the following steps:
|
|
164
|
+
- create the asynchronous scan
|
|
165
|
+
- periodically check the scan status to know when it's finished
|
|
166
|
+
- get the actual scan results
|
|
167
|
+
https://learn.microsoft.com/en-us/power-bi/enterprise/service-admin-metadata-scanning
|
|
168
|
+
"""
|
|
169
|
+
ids = self._workspace_ids()
|
|
170
|
+
|
|
171
|
+
for index in range(0, len(ids), METADATA_BATCH_SIZE):
|
|
172
|
+
batch_ids = ids[index : index + METADATA_BATCH_SIZE]
|
|
173
|
+
scan_id = self._create_scan(batch_ids)
|
|
174
|
+
self._wait_for_scan_result(scan_id)
|
|
175
|
+
yield from self._get_scan_result(scan_id)
|
|
176
|
+
|
|
177
|
+
def test_connection(self) -> None:
|
|
178
|
+
"""Use credentials & verify requesting the API doesn't raise an error"""
|
|
179
|
+
self._auth.refresh_token()
|
|
180
|
+
|
|
181
|
+
def fetch(
|
|
182
|
+
self,
|
|
183
|
+
asset: PowerBiAsset,
|
|
184
|
+
*,
|
|
185
|
+
day: Optional[date] = None,
|
|
186
|
+
) -> Iterator[Dict]:
|
|
187
|
+
"""
|
|
188
|
+
Given a PowerBi asset, returns the corresponding data using the
|
|
189
|
+
appropriate client.
|
|
190
|
+
"""
|
|
191
|
+
if asset == PowerBiAsset.ACTIVITY_EVENTS:
|
|
192
|
+
yield from self._activity_events(day=day)
|
|
193
|
+
|
|
194
|
+
elif asset == PowerBiAsset.DATASETS:
|
|
195
|
+
yield from self._datasets()
|
|
196
|
+
|
|
197
|
+
elif asset == PowerBiAsset.DASHBOARDS:
|
|
198
|
+
yield from self._dashboards()
|
|
199
|
+
|
|
200
|
+
elif asset == PowerBiAsset.REPORTS:
|
|
201
|
+
yield from self._reports()
|
|
202
|
+
|
|
203
|
+
elif asset == PowerBiAsset.METADATA:
|
|
204
|
+
yield from self._metadata()
|
|
205
|
+
|
|
206
|
+
else:
|
|
207
|
+
raise ValueError(f"This asset {asset} is unknown")
|
castor_extractor/visualization/powerbi/client/client_test.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
from datetime import date
|
|
2
|
+
from unittest.mock import Mock, call, patch
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from .authentication import msal
|
|
7
|
+
from .client import PowerbiClient
|
|
8
|
+
from .constants import Keys
|
|
9
|
+
from .credentials import PowerbiCredentials
|
|
10
|
+
from .endpoints import PowerBiEndpointFactory
|
|
11
|
+
|
|
12
|
+
FAKE_TENANT_ID = "IamFake"
|
|
13
|
+
FAKE_CLIENT_ID = "MeTwo"
|
|
14
|
+
FAKE_SECRET = "MeThree"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.fixture
|
|
18
|
+
def mock_msal():
|
|
19
|
+
with patch.object(msal, "ConfidentialClientApplication") as mock_app:
|
|
20
|
+
mock_app.return_value.acquire_token_for_client.return_value = {
|
|
21
|
+
"access_token": "fake_token"
|
|
22
|
+
}
|
|
23
|
+
yield mock_app
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture
|
|
27
|
+
def power_bi_client(mock_msal):
|
|
28
|
+
creds = PowerbiCredentials(
|
|
29
|
+
tenant_id=FAKE_TENANT_ID,
|
|
30
|
+
client_id=FAKE_CLIENT_ID,
|
|
31
|
+
secret=FAKE_SECRET,
|
|
32
|
+
)
|
|
33
|
+
return PowerbiClient(creds)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test__access_token(power_bi_client, mock_msal):
|
|
37
|
+
# Valid token scenario
|
|
38
|
+
valid_token = "mock_token"
|
|
39
|
+
mock_response = {"access_token": valid_token}
|
|
40
|
+
returning_valid_token = Mock(return_value=mock_response)
|
|
41
|
+
mock_msal.return_value.acquire_token_for_client = returning_valid_token
|
|
42
|
+
|
|
43
|
+
assert power_bi_client._auth.fetch_token() == valid_token
|
|
44
|
+
|
|
45
|
+
# Invalid token scenario
|
|
46
|
+
invalid_response = {"not_access_token": "666"}
|
|
47
|
+
returning_invalid_token = Mock(return_value=invalid_response)
|
|
48
|
+
mock_msal.return_value.acquire_token_for_client = returning_invalid_token
|
|
49
|
+
|
|
50
|
+
with pytest.raises(ValueError):
|
|
51
|
+
power_bi_client._auth.fetch_token()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test__datasets(power_bi_client):
|
|
55
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
56
|
+
mocked_get.return_value = {"value": [{"id": 1, "type": "dataset"}]}
|
|
57
|
+
datasets = list(power_bi_client._datasets())
|
|
58
|
+
mocked_get.assert_called_with(PowerBiEndpointFactory.datasets())
|
|
59
|
+
assert datasets == [{"id": 1, "type": "dataset"}]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test__dashboards(power_bi_client):
|
|
63
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
64
|
+
mocked_get.return_value = {"value": [{"id": 1, "type": "dashboard"}]}
|
|
65
|
+
dashboards = list(power_bi_client._dashboards())
|
|
66
|
+
mocked_get.assert_called_with(PowerBiEndpointFactory.dashboards())
|
|
67
|
+
assert dashboards == [{"id": 1, "type": "dashboard"}]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test__reports(power_bi_client):
|
|
71
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
72
|
+
mocked_get.side_effect = [
|
|
73
|
+
{"value": [{"id": 1, "type": "report"}]},
|
|
74
|
+
{
|
|
75
|
+
"value": [
|
|
76
|
+
{"name": "page_name", "displayName": "page", "order": 0}
|
|
77
|
+
]
|
|
78
|
+
},
|
|
79
|
+
]
|
|
80
|
+
reports = list(power_bi_client._reports())
|
|
81
|
+
calls = [
|
|
82
|
+
call(PowerBiEndpointFactory.reports()),
|
|
83
|
+
call(PowerBiEndpointFactory.pages("1")),
|
|
84
|
+
]
|
|
85
|
+
mocked_get.assert_has_calls(calls)
|
|
86
|
+
assert reports == [
|
|
87
|
+
{
|
|
88
|
+
"id": 1,
|
|
89
|
+
"type": "report",
|
|
90
|
+
"pages": [
|
|
91
|
+
{"name": "page_name", "displayName": "page", "order": 0}
|
|
92
|
+
],
|
|
93
|
+
}
|
|
94
|
+
]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test__workspace_ids(power_bi_client):
|
|
98
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
99
|
+
mocked_get.return_value = [{"id": 1000}, {"id": 1001}, {"id": 1003}]
|
|
100
|
+
|
|
101
|
+
ids = power_bi_client._workspace_ids()
|
|
102
|
+
assert ids == [1000, 1001, 1003]
|
|
103
|
+
|
|
104
|
+
params = {
|
|
105
|
+
Keys.INACTIVE_WORKSPACES: True,
|
|
106
|
+
Keys.PERSONAL_WORKSPACES: True,
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
mocked_get.assert_called_with(
|
|
110
|
+
PowerBiEndpointFactory.workspace_ids(),
|
|
111
|
+
params=params,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@patch.object(PowerbiClient, "_get_scan_result")
|
|
116
|
+
@patch.object(PowerbiClient, "_wait_for_scan_result")
|
|
117
|
+
@patch.object(PowerbiClient, "_create_scan")
|
|
118
|
+
@patch.object(PowerbiClient, "_workspace_ids")
|
|
119
|
+
def test__metadata(
|
|
120
|
+
mock_workspace_ids,
|
|
121
|
+
mock_create_scan,
|
|
122
|
+
mock_wait_for_scan,
|
|
123
|
+
mock_get_scan_result,
|
|
124
|
+
power_bi_client,
|
|
125
|
+
):
|
|
126
|
+
mock_workspace_ids.return_value = list(range(200))
|
|
127
|
+
mock_create_scan.return_value = 314
|
|
128
|
+
mock_wait_for_scan.return_value = True
|
|
129
|
+
mock_get_scan_result.return_value = [{"workspace_id": 1871}]
|
|
130
|
+
|
|
131
|
+
result = list(power_bi_client._metadata())
|
|
132
|
+
|
|
133
|
+
assert result == [{"workspace_id": 1871}, {"workspace_id": 1871}]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test__activity_events(power_bi_client):
|
|
137
|
+
day = date.today()
|
|
138
|
+
mocked_get_results = [
|
|
139
|
+
{
|
|
140
|
+
Keys.ACTIVITY_EVENT_ENTITIES: ["foo", "bar"],
|
|
141
|
+
Keys.LAST_RESULT_SET: False,
|
|
142
|
+
Keys.CONTINUATION_URI: "https://next-call-1",
|
|
143
|
+
},
|
|
144
|
+
{
|
|
145
|
+
Keys.ACTIVITY_EVENT_ENTITIES: ["baz"],
|
|
146
|
+
Keys.LAST_RESULT_SET: False,
|
|
147
|
+
Keys.CONTINUATION_URI: "https://next-call-2",
|
|
148
|
+
},
|
|
149
|
+
{
|
|
150
|
+
Keys.ACTIVITY_EVENT_ENTITIES: ["biz"],
|
|
151
|
+
Keys.LAST_RESULT_SET: True,
|
|
152
|
+
Keys.CONTINUATION_URI: None,
|
|
153
|
+
},
|
|
154
|
+
]
|
|
155
|
+
|
|
156
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
157
|
+
mocked_get.side_effect = mocked_get_results
|
|
158
|
+
|
|
159
|
+
result = list(power_bi_client._activity_events(day=day))
|
|
160
|
+
assert result == ["foo", "bar", "baz", "biz"]
|
|
161
|
+
|
|
162
|
+
expected_calls = [
|
|
163
|
+
call(endpoint=PowerBiEndpointFactory.activity_events(day=day)),
|
|
164
|
+
call(endpoint="https://next-call-1"),
|
|
165
|
+
call(endpoint="https://next-call-2"),
|
|
166
|
+
]
|
|
167
|
+
mocked_get.assert_has_calls(expected_calls)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def test_test_connection(power_bi_client):
|
|
171
|
+
with patch.object(power_bi_client._auth, "refresh_token") as mock_refresh:
|
|
172
|
+
power_bi_client.test_connection()
|
|
173
|
+
mock_refresh.assert_called_once()
|