castor-extractor 0.20.0__py3-none-any.whl → 0.20.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +20 -0
- castor_extractor/commands/extract_redshift.py +6 -0
- castor_extractor/commands/extract_thoughtspot.py +18 -0
- castor_extractor/utils/client/api/client.py +7 -2
- castor_extractor/utils/client/api/safe_request.py +6 -3
- castor_extractor/visualization/looker/api/constants.py +0 -4
- castor_extractor/visualization/powerbi/__init__.py +1 -1
- castor_extractor/visualization/powerbi/assets.py +7 -1
- castor_extractor/visualization/powerbi/client/__init__.py +2 -3
- castor_extractor/visualization/powerbi/client/authentication.py +27 -0
- castor_extractor/visualization/powerbi/client/client.py +207 -0
- castor_extractor/visualization/powerbi/client/client_test.py +173 -0
- castor_extractor/visualization/powerbi/client/constants.py +0 -67
- castor_extractor/visualization/powerbi/client/credentials.py +3 -4
- castor_extractor/visualization/powerbi/client/credentials_test.py +3 -4
- castor_extractor/visualization/powerbi/client/endpoints.py +65 -0
- castor_extractor/visualization/powerbi/client/pagination.py +32 -0
- castor_extractor/visualization/powerbi/extract.py +14 -9
- castor_extractor/visualization/thoughtspot/__init__.py +3 -0
- castor_extractor/visualization/thoughtspot/assets.py +9 -0
- castor_extractor/visualization/thoughtspot/client/__init__.py +2 -0
- castor_extractor/visualization/thoughtspot/client/client.py +120 -0
- castor_extractor/visualization/thoughtspot/client/credentials.py +18 -0
- castor_extractor/visualization/thoughtspot/client/endpoints.py +12 -0
- castor_extractor/visualization/thoughtspot/client/utils.py +25 -0
- castor_extractor/visualization/thoughtspot/client/utils_test.py +57 -0
- castor_extractor/visualization/thoughtspot/extract.py +49 -0
- castor_extractor/warehouse/redshift/extract.py +10 -1
- castor_extractor/warehouse/redshift/extract_test.py +26 -0
- castor_extractor/warehouse/redshift/queries/query_serverless.sql +69 -0
- castor_extractor/warehouse/redshift/query.py +12 -1
- castor_extractor/warehouse/salesforce/client.py +1 -1
- castor_extractor/warehouse/salesforce/format.py +40 -30
- castor_extractor/warehouse/salesforce/format_test.py +61 -24
- {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/METADATA +21 -1
- {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/RECORD +39 -26
- {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/entry_points.txt +1 -0
- castor_extractor/visualization/powerbi/client/rest.py +0 -305
- castor_extractor/visualization/powerbi/client/rest_test.py +0 -290
- castor_extractor/visualization/powerbi/client/utils.py +0 -19
- castor_extractor/visualization/powerbi/client/utils_test.py +0 -24
- {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/LICENCE +0 -0
- {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/WHEEL +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,6 +1,26 @@
|
|
|
1
1
|
|
|
2
2
|
# Changelog
|
|
3
3
|
|
|
4
|
+
## 0.20.5 - 2024-10-09
|
|
5
|
+
|
|
6
|
+
* Redshift: enable extraction from a Redshift Serverless instance
|
|
7
|
+
|
|
8
|
+
## 0.20.4 - 2024-10-09
|
|
9
|
+
|
|
10
|
+
* Salesforce warehouse: `Labels` instead of `api_names` for columns
|
|
11
|
+
|
|
12
|
+
## 0.20.3 - 2024-10-03
|
|
13
|
+
|
|
14
|
+
* Looker: no longer extract `as_html` dashboard elements
|
|
15
|
+
|
|
16
|
+
## 0.20.2 - 2024-09-24
|
|
17
|
+
|
|
18
|
+
* Thoughtspot: Adding connector
|
|
19
|
+
|
|
20
|
+
## 0.20.1 - 2024-09-23
|
|
21
|
+
|
|
22
|
+
* Power BI: Improved client based on APIClient
|
|
23
|
+
|
|
4
24
|
## 0.20.0 - 2024-09-23
|
|
5
25
|
|
|
6
26
|
* Switch to Tableau revamped connector
|
|
@@ -23,6 +23,11 @@ def main():
|
|
|
23
23
|
action="store_true",
|
|
24
24
|
help="Skips files already extracted instead of replacing them",
|
|
25
25
|
)
|
|
26
|
+
parser.add_argument(
|
|
27
|
+
"--serverless",
|
|
28
|
+
action="store_true",
|
|
29
|
+
help="Enables extraction for Redshift Serverless",
|
|
30
|
+
)
|
|
26
31
|
parser.set_defaults(skip_existing=False)
|
|
27
32
|
|
|
28
33
|
args = parser.parse_args()
|
|
@@ -34,5 +39,6 @@ def main():
|
|
|
34
39
|
user=args.user,
|
|
35
40
|
password=args.password,
|
|
36
41
|
output_directory=args.output,
|
|
42
|
+
serverless=args.serverless,
|
|
37
43
|
skip_existing=args.skip_existing,
|
|
38
44
|
)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from argparse import ArgumentParser
|
|
3
|
+
|
|
4
|
+
from castor_extractor.utils import parse_filled_arguments # type: ignore
|
|
5
|
+
from castor_extractor.visualization import thoughtspot # type: ignore
|
|
6
|
+
|
|
7
|
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
|
|
11
|
+
parser = ArgumentParser()
|
|
12
|
+
|
|
13
|
+
parser.add_argument("-b", "--base_url", help="base url")
|
|
14
|
+
parser.add_argument("-u", "--username", help="username")
|
|
15
|
+
parser.add_argument("-p", "--password", help="password")
|
|
16
|
+
parser.add_argument("-o", "--output", help="Directory to write to")
|
|
17
|
+
|
|
18
|
+
thoughtspot.extract_all(**parse_filled_arguments(parser))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from http import HTTPStatus
|
|
3
|
-
from typing import Dict, Literal, Optional, Tuple
|
|
3
|
+
from typing import Callable, Dict, Literal, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
import requests
|
|
6
6
|
from requests import Response
|
|
@@ -137,12 +137,15 @@ class APIClient:
|
|
|
137
137
|
endpoint: str,
|
|
138
138
|
*,
|
|
139
139
|
headers: Headers = None,
|
|
140
|
+
params: Optional[dict] = None,
|
|
140
141
|
data: Optional[dict] = None,
|
|
141
142
|
pagination_params: Optional[dict] = None,
|
|
143
|
+
handler: Optional[Callable] = None,
|
|
142
144
|
):
|
|
143
145
|
response = self._call(
|
|
144
146
|
method="POST",
|
|
145
147
|
endpoint=endpoint,
|
|
148
|
+
params=params,
|
|
146
149
|
data=data,
|
|
147
150
|
pagination_params=pagination_params,
|
|
148
151
|
headers=headers,
|
|
@@ -150,4 +153,6 @@ class APIClient:
|
|
|
150
153
|
if response.status_code == HTTPStatus.UNAUTHORIZED:
|
|
151
154
|
self._auth.refresh_token()
|
|
152
155
|
|
|
153
|
-
return handle_response(
|
|
156
|
+
return handle_response(
|
|
157
|
+
response, safe_mode=self._safe_mode, handler=handler
|
|
158
|
+
)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any, List, Optional, Tuple, Union
|
|
2
|
+
from typing import Any, Callable, List, Optional, Tuple, Union
|
|
3
3
|
|
|
4
4
|
from requests import HTTPError, Response
|
|
5
5
|
|
|
@@ -41,9 +41,10 @@ class RequestSafeMode:
|
|
|
41
41
|
def handle_response(
|
|
42
42
|
response: Response,
|
|
43
43
|
safe_mode: Optional[RequestSafeMode] = None,
|
|
44
|
+
handler: Optional[Callable] = None,
|
|
44
45
|
) -> Any:
|
|
45
46
|
"""
|
|
46
|
-
Util to handle
|
|
47
|
+
Util to handle HTTP Response based on the response status code and the
|
|
47
48
|
safe mode used
|
|
48
49
|
"""
|
|
49
50
|
safe_mode = safe_mode if safe_mode else RequestSafeMode()
|
|
@@ -56,4 +57,6 @@ def handle_response(
|
|
|
56
57
|
logger.error(f"Safe mode : skip request with error {e}")
|
|
57
58
|
logger.debug(e, exc_info=True)
|
|
58
59
|
return {}
|
|
59
|
-
|
|
60
|
+
if not handler:
|
|
61
|
+
return response.json()
|
|
62
|
+
return handler(response)
|
|
@@ -32,14 +32,10 @@ DASHBOARD_FILTERS = (
|
|
|
32
32
|
DASHBOARD_ELEMENTS = (
|
|
33
33
|
"id",
|
|
34
34
|
"body_text",
|
|
35
|
-
"body_text_as_html",
|
|
36
35
|
"note_text",
|
|
37
|
-
"note_text_as_html",
|
|
38
36
|
"subtitle_text",
|
|
39
|
-
"subtitle_text_as_html",
|
|
40
37
|
"title",
|
|
41
38
|
"title_text",
|
|
42
|
-
"title_text_as_html",
|
|
43
39
|
"title_hidden",
|
|
44
40
|
"type",
|
|
45
41
|
{
|
|
@@ -11,16 +11,19 @@ class PowerBiAsset(ExternalAsset):
|
|
|
11
11
|
DATASETS = "datasets"
|
|
12
12
|
DATASET_FIELDS = "dataset_fields"
|
|
13
13
|
METADATA = "metadata"
|
|
14
|
+
PAGES = "pages"
|
|
14
15
|
REPORTS = "reports"
|
|
15
16
|
TABLES = "tables"
|
|
17
|
+
TILES = "tiles"
|
|
16
18
|
USERS = "users"
|
|
17
19
|
|
|
18
20
|
@classproperty
|
|
19
21
|
def optional(cls) -> Set["PowerBiAsset"]:
|
|
20
22
|
return {
|
|
21
|
-
PowerBiAsset.DASHBOARDS,
|
|
22
23
|
PowerBiAsset.DATASET_FIELDS,
|
|
24
|
+
PowerBiAsset.PAGES,
|
|
23
25
|
PowerBiAsset.TABLES,
|
|
26
|
+
PowerBiAsset.TILES,
|
|
24
27
|
PowerBiAsset.USERS,
|
|
25
28
|
}
|
|
26
29
|
|
|
@@ -30,5 +33,8 @@ class PowerBiAsset(ExternalAsset):
|
|
|
30
33
|
METADATA_ASSETS = (
|
|
31
34
|
PowerBiAsset.DATASET_FIELDS,
|
|
32
35
|
PowerBiAsset.TABLES,
|
|
36
|
+
PowerBiAsset.TILES,
|
|
33
37
|
PowerBiAsset.USERS,
|
|
34
38
|
)
|
|
39
|
+
|
|
40
|
+
REPORTS_ASSETS = (PowerBiAsset.PAGES,)
|
|
@@ -1,3 +1,2 @@
|
|
|
1
|
-
from .
|
|
2
|
-
from .credentials import PowerbiCredentials
|
|
3
|
-
from .rest import Client
|
|
1
|
+
from .client import PowerbiClient
|
|
2
|
+
from .credentials import DEFAULT_SCOPE, PowerbiCredentials
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import msal # type: ignore
|
|
2
|
+
|
|
3
|
+
from ....utils import BearerAuth
|
|
4
|
+
from .constants import Keys
|
|
5
|
+
from .credentials import PowerbiCredentials
|
|
6
|
+
from .endpoints import PowerBiEndpointFactory
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PowerBiBearerAuth(BearerAuth):
|
|
10
|
+
def __init__(self, credentials: PowerbiCredentials):
|
|
11
|
+
self.credentials = credentials
|
|
12
|
+
authority = PowerBiEndpointFactory.authority(self.credentials.tenant_id)
|
|
13
|
+
self.app = msal.ConfidentialClientApplication(
|
|
14
|
+
client_id=self.credentials.client_id,
|
|
15
|
+
authority=authority,
|
|
16
|
+
client_credential=self.credentials.secret,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
def fetch_token(self):
|
|
20
|
+
token = self.app.acquire_token_for_client(
|
|
21
|
+
scopes=self.credentials.scopes
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
if Keys.ACCESS_TOKEN not in token:
|
|
25
|
+
raise ValueError(f"No access token in token response: {token}")
|
|
26
|
+
|
|
27
|
+
return token[Keys.ACCESS_TOKEN]
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from datetime import date
|
|
3
|
+
from functools import partial
|
|
4
|
+
from time import sleep
|
|
5
|
+
from typing import Dict, Iterator, List, Optional, Union
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
from requests import HTTPError
|
|
9
|
+
|
|
10
|
+
from ....utils import (
|
|
11
|
+
APIClient,
|
|
12
|
+
fetch_all_pages,
|
|
13
|
+
)
|
|
14
|
+
from ..assets import PowerBiAsset
|
|
15
|
+
from .authentication import PowerBiBearerAuth
|
|
16
|
+
from .constants import Keys
|
|
17
|
+
from .credentials import PowerbiCredentials
|
|
18
|
+
from .endpoints import PowerBiEndpointFactory
|
|
19
|
+
from .pagination import PowerBiPagination
|
|
20
|
+
|
|
21
|
+
POWERBI_DEFAULT_TIMEOUT_S = 30
|
|
22
|
+
# The route we use to fetch workspaces info can retrieve a maximum of
|
|
23
|
+
# 100 workspaces per call
|
|
24
|
+
# More: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info#request-body
|
|
25
|
+
METADATA_BATCH_SIZE = 100
|
|
26
|
+
POWERBI_SCAN_STATUS_DONE = "Succeeded"
|
|
27
|
+
POWERBI_SCAN_SLEEP_S = 1
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class PowerbiClient(APIClient):
|
|
33
|
+
def __init__(
|
|
34
|
+
self,
|
|
35
|
+
credentials: PowerbiCredentials,
|
|
36
|
+
):
|
|
37
|
+
auth = PowerBiBearerAuth(credentials=credentials)
|
|
38
|
+
super().__init__(
|
|
39
|
+
auth=auth,
|
|
40
|
+
timeout=POWERBI_DEFAULT_TIMEOUT_S,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
def _activity_events(self, day: Optional[date] = None) -> Iterator[Dict]:
|
|
44
|
+
"""
|
|
45
|
+
Returns a list of activity events for the organization.
|
|
46
|
+
https://learn.microsoft.com/en-us/power-bi/admin/service-admin-auditing#activityevents-rest-api
|
|
47
|
+
- when no day is specified, fallback is yesterday
|
|
48
|
+
"""
|
|
49
|
+
request = partial(
|
|
50
|
+
self._get,
|
|
51
|
+
endpoint=PowerBiEndpointFactory.activity_events(day),
|
|
52
|
+
)
|
|
53
|
+
yield from fetch_all_pages(request, PowerBiPagination)
|
|
54
|
+
|
|
55
|
+
def _datasets(self) -> Iterator[Dict]:
|
|
56
|
+
"""
|
|
57
|
+
Returns a list of datasets for the organization.
|
|
58
|
+
https://learn.microsoft.com/en-us/rest/api/power-bi/admin/datasets-get-datasets-as-admin
|
|
59
|
+
"""
|
|
60
|
+
yield from self._get(PowerBiEndpointFactory.datasets())[Keys.VALUE]
|
|
61
|
+
|
|
62
|
+
def _dashboards(self) -> Iterator[Dict]:
|
|
63
|
+
"""
|
|
64
|
+
Returns a list of dashboards for the organization.
|
|
65
|
+
https://learn.microsoft.com/en-us/rest/api/power-bi/admin/dashboards-get-dashboards-as-admin
|
|
66
|
+
"""
|
|
67
|
+
yield from self._get(PowerBiEndpointFactory.dashboards())[Keys.VALUE]
|
|
68
|
+
|
|
69
|
+
def _reports(self) -> Iterator[Dict]:
|
|
70
|
+
"""
|
|
71
|
+
Returns a list of reports for the organization.
|
|
72
|
+
https://learn.microsoft.com/en-us/rest/api/power-bi/admin/reports-get-reports-as-admin
|
|
73
|
+
"""
|
|
74
|
+
reports_endpoint = PowerBiEndpointFactory.reports()
|
|
75
|
+
reports = self._get(reports_endpoint)[Keys.VALUE]
|
|
76
|
+
|
|
77
|
+
for report in reports:
|
|
78
|
+
report_id = report.get(Keys.ID)
|
|
79
|
+
|
|
80
|
+
try:
|
|
81
|
+
pages_endpoint = PowerBiEndpointFactory.pages(report_id)
|
|
82
|
+
pages = self._get(pages_endpoint)[Keys.VALUE]
|
|
83
|
+
report["pages"] = pages
|
|
84
|
+
except (requests.HTTPError, requests.exceptions.Timeout) as e:
|
|
85
|
+
logger.debug(e)
|
|
86
|
+
continue
|
|
87
|
+
|
|
88
|
+
return reports
|
|
89
|
+
|
|
90
|
+
def _workspace_ids(self) -> List[str]:
|
|
91
|
+
"""
|
|
92
|
+
Get workspaces ids from powerBI admin API.
|
|
93
|
+
more: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-get-modified-workspaces
|
|
94
|
+
"""
|
|
95
|
+
params: Dict[str, Union[bool, str]] = {
|
|
96
|
+
Keys.INACTIVE_WORKSPACES: True,
|
|
97
|
+
Keys.PERSONAL_WORKSPACES: True,
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
response = self._get(
|
|
101
|
+
PowerBiEndpointFactory.workspace_ids(),
|
|
102
|
+
params=params,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
return [x[Keys.ID] for x in response]
|
|
106
|
+
|
|
107
|
+
def _get_scan_result(self, scan_id: int) -> Iterator[Dict]:
|
|
108
|
+
endpoint = PowerBiEndpointFactory.metadata_scan_result(scan_id)
|
|
109
|
+
yield from self._get(endpoint)[Keys.WORKSPACES]
|
|
110
|
+
|
|
111
|
+
def _wait_for_scan_result(self, scan_id: int) -> bool:
|
|
112
|
+
"""
|
|
113
|
+
Periodically checks the status of the metadata scan until the results
|
|
114
|
+
are ready.
|
|
115
|
+
"""
|
|
116
|
+
endpoint = PowerBiEndpointFactory.metadata_scan_status(scan_id)
|
|
117
|
+
total_waiting_time_s = 0
|
|
118
|
+
|
|
119
|
+
while total_waiting_time_s < POWERBI_DEFAULT_TIMEOUT_S:
|
|
120
|
+
try:
|
|
121
|
+
result = self._get(endpoint)
|
|
122
|
+
except HTTPError as e:
|
|
123
|
+
logger.error(f"Scan {scan_id} failed. Error: {e}")
|
|
124
|
+
return False
|
|
125
|
+
|
|
126
|
+
if result[Keys.STATUS] == POWERBI_SCAN_STATUS_DONE:
|
|
127
|
+
logger.info(f"scan {scan_id} ready")
|
|
128
|
+
return True
|
|
129
|
+
|
|
130
|
+
total_waiting_time_s += POWERBI_SCAN_SLEEP_S
|
|
131
|
+
logger.info(
|
|
132
|
+
f"Waiting {POWERBI_SCAN_SLEEP_S} sec for scan {scan_id} to be ready…",
|
|
133
|
+
)
|
|
134
|
+
sleep(POWERBI_SCAN_SLEEP_S)
|
|
135
|
+
|
|
136
|
+
logger.warning(f"Scan {scan_id} timed out")
|
|
137
|
+
return False
|
|
138
|
+
|
|
139
|
+
def _create_scan(self, workspaces_ids: List[str]) -> int:
|
|
140
|
+
"""
|
|
141
|
+
Tells the Power BI API to start an asynchronous metadata scan.
|
|
142
|
+
Returns the scan's ID.
|
|
143
|
+
https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info
|
|
144
|
+
"""
|
|
145
|
+
params = {
|
|
146
|
+
"datasetExpressions": True,
|
|
147
|
+
"datasetSchema": True,
|
|
148
|
+
"datasourceDetails": True,
|
|
149
|
+
"getArtifactUsers": True,
|
|
150
|
+
"lineage": True,
|
|
151
|
+
}
|
|
152
|
+
request_body = {"workspaces": workspaces_ids}
|
|
153
|
+
scan_id = self._post(
|
|
154
|
+
PowerBiEndpointFactory.metadata_create_scan(),
|
|
155
|
+
params=params,
|
|
156
|
+
data=request_body,
|
|
157
|
+
)
|
|
158
|
+
return scan_id[Keys.ID]
|
|
159
|
+
|
|
160
|
+
def _metadata(self) -> Iterator[Dict]:
|
|
161
|
+
"""
|
|
162
|
+
Fetch metadata by workspace. The metadata scanning is asynchronous and
|
|
163
|
+
requires the following steps:
|
|
164
|
+
- create the asynchronous scan
|
|
165
|
+
- periodically check the scan status to know when it's finished
|
|
166
|
+
- get the actual scan results
|
|
167
|
+
https://learn.microsoft.com/en-us/power-bi/enterprise/service-admin-metadata-scanning
|
|
168
|
+
"""
|
|
169
|
+
ids = self._workspace_ids()
|
|
170
|
+
|
|
171
|
+
for index in range(0, len(ids), METADATA_BATCH_SIZE):
|
|
172
|
+
batch_ids = ids[index : index + METADATA_BATCH_SIZE]
|
|
173
|
+
scan_id = self._create_scan(batch_ids)
|
|
174
|
+
self._wait_for_scan_result(scan_id)
|
|
175
|
+
yield from self._get_scan_result(scan_id)
|
|
176
|
+
|
|
177
|
+
def test_connection(self) -> None:
|
|
178
|
+
"""Use credentials & verify requesting the API doesn't raise an error"""
|
|
179
|
+
self._auth.refresh_token()
|
|
180
|
+
|
|
181
|
+
def fetch(
|
|
182
|
+
self,
|
|
183
|
+
asset: PowerBiAsset,
|
|
184
|
+
*,
|
|
185
|
+
day: Optional[date] = None,
|
|
186
|
+
) -> Iterator[Dict]:
|
|
187
|
+
"""
|
|
188
|
+
Given a PowerBi asset, returns the corresponding data using the
|
|
189
|
+
appropriate client.
|
|
190
|
+
"""
|
|
191
|
+
if asset == PowerBiAsset.ACTIVITY_EVENTS:
|
|
192
|
+
yield from self._activity_events(day=day)
|
|
193
|
+
|
|
194
|
+
elif asset == PowerBiAsset.DATASETS:
|
|
195
|
+
yield from self._datasets()
|
|
196
|
+
|
|
197
|
+
elif asset == PowerBiAsset.DASHBOARDS:
|
|
198
|
+
yield from self._dashboards()
|
|
199
|
+
|
|
200
|
+
elif asset == PowerBiAsset.REPORTS:
|
|
201
|
+
yield from self._reports()
|
|
202
|
+
|
|
203
|
+
elif asset == PowerBiAsset.METADATA:
|
|
204
|
+
yield from self._metadata()
|
|
205
|
+
|
|
206
|
+
else:
|
|
207
|
+
raise ValueError(f"This asset {asset} is unknown")
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
from datetime import date
|
|
2
|
+
from unittest.mock import Mock, call, patch
|
|
3
|
+
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from .authentication import msal
|
|
7
|
+
from .client import PowerbiClient
|
|
8
|
+
from .constants import Keys
|
|
9
|
+
from .credentials import PowerbiCredentials
|
|
10
|
+
from .endpoints import PowerBiEndpointFactory
|
|
11
|
+
|
|
12
|
+
FAKE_TENANT_ID = "IamFake"
|
|
13
|
+
FAKE_CLIENT_ID = "MeTwo"
|
|
14
|
+
FAKE_SECRET = "MeThree"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@pytest.fixture
|
|
18
|
+
def mock_msal():
|
|
19
|
+
with patch.object(msal, "ConfidentialClientApplication") as mock_app:
|
|
20
|
+
mock_app.return_value.acquire_token_for_client.return_value = {
|
|
21
|
+
"access_token": "fake_token"
|
|
22
|
+
}
|
|
23
|
+
yield mock_app
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@pytest.fixture
|
|
27
|
+
def power_bi_client(mock_msal):
|
|
28
|
+
creds = PowerbiCredentials(
|
|
29
|
+
tenant_id=FAKE_TENANT_ID,
|
|
30
|
+
client_id=FAKE_CLIENT_ID,
|
|
31
|
+
secret=FAKE_SECRET,
|
|
32
|
+
)
|
|
33
|
+
return PowerbiClient(creds)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test__access_token(power_bi_client, mock_msal):
|
|
37
|
+
# Valid token scenario
|
|
38
|
+
valid_token = "mock_token"
|
|
39
|
+
mock_response = {"access_token": valid_token}
|
|
40
|
+
returning_valid_token = Mock(return_value=mock_response)
|
|
41
|
+
mock_msal.return_value.acquire_token_for_client = returning_valid_token
|
|
42
|
+
|
|
43
|
+
assert power_bi_client._auth.fetch_token() == valid_token
|
|
44
|
+
|
|
45
|
+
# Invalid token scenario
|
|
46
|
+
invalid_response = {"not_access_token": "666"}
|
|
47
|
+
returning_invalid_token = Mock(return_value=invalid_response)
|
|
48
|
+
mock_msal.return_value.acquire_token_for_client = returning_invalid_token
|
|
49
|
+
|
|
50
|
+
with pytest.raises(ValueError):
|
|
51
|
+
power_bi_client._auth.fetch_token()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test__datasets(power_bi_client):
|
|
55
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
56
|
+
mocked_get.return_value = {"value": [{"id": 1, "type": "dataset"}]}
|
|
57
|
+
datasets = list(power_bi_client._datasets())
|
|
58
|
+
mocked_get.assert_called_with(PowerBiEndpointFactory.datasets())
|
|
59
|
+
assert datasets == [{"id": 1, "type": "dataset"}]
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test__dashboards(power_bi_client):
|
|
63
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
64
|
+
mocked_get.return_value = {"value": [{"id": 1, "type": "dashboard"}]}
|
|
65
|
+
dashboards = list(power_bi_client._dashboards())
|
|
66
|
+
mocked_get.assert_called_with(PowerBiEndpointFactory.dashboards())
|
|
67
|
+
assert dashboards == [{"id": 1, "type": "dashboard"}]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test__reports(power_bi_client):
|
|
71
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
72
|
+
mocked_get.side_effect = [
|
|
73
|
+
{"value": [{"id": 1, "type": "report"}]},
|
|
74
|
+
{
|
|
75
|
+
"value": [
|
|
76
|
+
{"name": "page_name", "displayName": "page", "order": 0}
|
|
77
|
+
]
|
|
78
|
+
},
|
|
79
|
+
]
|
|
80
|
+
reports = list(power_bi_client._reports())
|
|
81
|
+
calls = [
|
|
82
|
+
call(PowerBiEndpointFactory.reports()),
|
|
83
|
+
call(PowerBiEndpointFactory.pages("1")),
|
|
84
|
+
]
|
|
85
|
+
mocked_get.assert_has_calls(calls)
|
|
86
|
+
assert reports == [
|
|
87
|
+
{
|
|
88
|
+
"id": 1,
|
|
89
|
+
"type": "report",
|
|
90
|
+
"pages": [
|
|
91
|
+
{"name": "page_name", "displayName": "page", "order": 0}
|
|
92
|
+
],
|
|
93
|
+
}
|
|
94
|
+
]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test__workspace_ids(power_bi_client):
|
|
98
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
99
|
+
mocked_get.return_value = [{"id": 1000}, {"id": 1001}, {"id": 1003}]
|
|
100
|
+
|
|
101
|
+
ids = power_bi_client._workspace_ids()
|
|
102
|
+
assert ids == [1000, 1001, 1003]
|
|
103
|
+
|
|
104
|
+
params = {
|
|
105
|
+
Keys.INACTIVE_WORKSPACES: True,
|
|
106
|
+
Keys.PERSONAL_WORKSPACES: True,
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
mocked_get.assert_called_with(
|
|
110
|
+
PowerBiEndpointFactory.workspace_ids(),
|
|
111
|
+
params=params,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@patch.object(PowerbiClient, "_get_scan_result")
|
|
116
|
+
@patch.object(PowerbiClient, "_wait_for_scan_result")
|
|
117
|
+
@patch.object(PowerbiClient, "_create_scan")
|
|
118
|
+
@patch.object(PowerbiClient, "_workspace_ids")
|
|
119
|
+
def test__metadata(
|
|
120
|
+
mock_workspace_ids,
|
|
121
|
+
mock_create_scan,
|
|
122
|
+
mock_wait_for_scan,
|
|
123
|
+
mock_get_scan_result,
|
|
124
|
+
power_bi_client,
|
|
125
|
+
):
|
|
126
|
+
mock_workspace_ids.return_value = list(range(200))
|
|
127
|
+
mock_create_scan.return_value = 314
|
|
128
|
+
mock_wait_for_scan.return_value = True
|
|
129
|
+
mock_get_scan_result.return_value = [{"workspace_id": 1871}]
|
|
130
|
+
|
|
131
|
+
result = list(power_bi_client._metadata())
|
|
132
|
+
|
|
133
|
+
assert result == [{"workspace_id": 1871}, {"workspace_id": 1871}]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def test__activity_events(power_bi_client):
|
|
137
|
+
day = date.today()
|
|
138
|
+
mocked_get_results = [
|
|
139
|
+
{
|
|
140
|
+
Keys.ACTIVITY_EVENT_ENTITIES: ["foo", "bar"],
|
|
141
|
+
Keys.LAST_RESULT_SET: False,
|
|
142
|
+
Keys.CONTINUATION_URI: "https://next-call-1",
|
|
143
|
+
},
|
|
144
|
+
{
|
|
145
|
+
Keys.ACTIVITY_EVENT_ENTITIES: ["baz"],
|
|
146
|
+
Keys.LAST_RESULT_SET: False,
|
|
147
|
+
Keys.CONTINUATION_URI: "https://next-call-2",
|
|
148
|
+
},
|
|
149
|
+
{
|
|
150
|
+
Keys.ACTIVITY_EVENT_ENTITIES: ["biz"],
|
|
151
|
+
Keys.LAST_RESULT_SET: True,
|
|
152
|
+
Keys.CONTINUATION_URI: None,
|
|
153
|
+
},
|
|
154
|
+
]
|
|
155
|
+
|
|
156
|
+
with patch.object(power_bi_client, "_get") as mocked_get:
|
|
157
|
+
mocked_get.side_effect = mocked_get_results
|
|
158
|
+
|
|
159
|
+
result = list(power_bi_client._activity_events(day=day))
|
|
160
|
+
assert result == ["foo", "bar", "baz", "biz"]
|
|
161
|
+
|
|
162
|
+
expected_calls = [
|
|
163
|
+
call(endpoint=PowerBiEndpointFactory.activity_events(day=day)),
|
|
164
|
+
call(endpoint="https://next-call-1"),
|
|
165
|
+
call(endpoint="https://next-call-2"),
|
|
166
|
+
]
|
|
167
|
+
mocked_get.assert_has_calls(expected_calls)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def test_test_connection(power_bi_client):
|
|
171
|
+
with patch.object(power_bi_client._auth, "refresh_token") as mock_refresh:
|
|
172
|
+
power_bi_client.test_connection()
|
|
173
|
+
mock_refresh.assert_called_once()
|
|
@@ -1,62 +1,3 @@
|
|
|
1
|
-
"""
|
|
2
|
-
File regrouping all constants used in PowerBi client
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
DEFAULT_TIMEOUT_IN_SECS = 30
|
|
6
|
-
SCAN_READY = "Succeeded"
|
|
7
|
-
# ModifiedSince params should not be older than 30 days
|
|
8
|
-
RECENT_DAYS = 30
|
|
9
|
-
|
|
10
|
-
GET = "GET"
|
|
11
|
-
POST = "POST"
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class Urls:
|
|
15
|
-
"""PowerBi's urls"""
|
|
16
|
-
|
|
17
|
-
CLIENT_APP_BASE = "https://login.microsoftonline.com/"
|
|
18
|
-
DEFAULT_SCOPE = "https://analysis.windows.net/powerbi/api/.default"
|
|
19
|
-
REST_API_BASE_PATH = "https://api.powerbi.com/v1.0/myorg"
|
|
20
|
-
|
|
21
|
-
# PBI rest API Routes
|
|
22
|
-
ACTIVITY_EVENTS = f"{REST_API_BASE_PATH}/admin/activityevents"
|
|
23
|
-
DASHBOARD = f"{REST_API_BASE_PATH}/admin/dashboards"
|
|
24
|
-
DATASETS = f"{REST_API_BASE_PATH}/admin/datasets"
|
|
25
|
-
GROUPS = f"{REST_API_BASE_PATH}/admin/groups"
|
|
26
|
-
METADATA_GET = f"{REST_API_BASE_PATH}/admin/workspaces/scanResult"
|
|
27
|
-
METADATA_POST = f"{REST_API_BASE_PATH}/admin/workspaces/getInfo"
|
|
28
|
-
METADATA_WAIT = f"{REST_API_BASE_PATH}/admin/workspaces/scanStatus"
|
|
29
|
-
REPORTS = f"{REST_API_BASE_PATH}/admin/reports"
|
|
30
|
-
WORKSPACE_IDS = (
|
|
31
|
-
"https://api.powerbi.com/v1.0/myorg/admin/workspaces/modified"
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class Batches:
|
|
36
|
-
"""Batches used within PowerBI api calls"""
|
|
37
|
-
|
|
38
|
-
DEFAULT = 100
|
|
39
|
-
# The route we use to fetch workspaces info can retrieve a maximum of
|
|
40
|
-
# 100 workspaces per call
|
|
41
|
-
# More: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info#request-body
|
|
42
|
-
METADATA = 100
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
class QueryParams:
|
|
46
|
-
"""
|
|
47
|
-
Frequently used PowerBi query params
|
|
48
|
-
"""
|
|
49
|
-
|
|
50
|
-
METADATA_SCAN = {
|
|
51
|
-
"datasetExpressions": True,
|
|
52
|
-
"datasetSchema": True,
|
|
53
|
-
"datasourceDetails": True,
|
|
54
|
-
"getArtifactUsers": True,
|
|
55
|
-
"lineage": True,
|
|
56
|
-
}
|
|
57
|
-
ACTIVE_WORKSPACE_FILTER = "state eq 'Active' and type eq 'Workspace'"
|
|
58
|
-
|
|
59
|
-
|
|
60
1
|
class Keys:
|
|
61
2
|
ACCESS_TOKEN = "access_token" # noqa: S105
|
|
62
3
|
ACTIVITY_EVENT_ENTITIES = "activityEventEntities"
|
|
@@ -64,15 +5,7 @@ class Keys:
|
|
|
64
5
|
ID = "id"
|
|
65
6
|
INACTIVE_WORKSPACES = "excludeInActiveWorkspaces"
|
|
66
7
|
LAST_RESULT_SET = "lastResultSet"
|
|
67
|
-
MODIFIED_SINCE = "modifiedSince"
|
|
68
8
|
PERSONAL_WORKSPACES = "excludePersonalWorkspaces"
|
|
69
9
|
STATUS = "status"
|
|
70
10
|
VALUE = "value"
|
|
71
11
|
WORKSPACES = "workspaces"
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
class Assertions:
|
|
75
|
-
"""Assertion's messages"""
|
|
76
|
-
|
|
77
|
-
BATCH_TOO_BIG = f"Can not retrieve more than {Batches.METADATA} at the time"
|
|
78
|
-
DATETIME_TOO_OLD = "Date must be within 30 days range"
|