castor-extractor 0.19.8-py3-none-any.whl → 0.20.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic.
- CHANGELOG.md +24 -0
- castor_extractor/commands/extract_tableau.py +8 -22
- castor_extractor/commands/extract_thoughtspot.py +18 -0
- castor_extractor/utils/__init__.py +1 -1
- castor_extractor/utils/client/api/client.py +14 -9
- castor_extractor/utils/client/api/pagination.py +2 -2
- castor_extractor/utils/client/api/safe_request.py +6 -3
- castor_extractor/utils/write.py +1 -1
- castor_extractor/visualization/looker/api/constants.py +0 -4
- castor_extractor/visualization/powerbi/__init__.py +1 -1
- castor_extractor/visualization/powerbi/assets.py +7 -1
- castor_extractor/visualization/powerbi/client/__init__.py +2 -3
- castor_extractor/visualization/powerbi/client/authentication.py +27 -0
- castor_extractor/visualization/powerbi/client/client.py +207 -0
- castor_extractor/visualization/powerbi/client/client_test.py +173 -0
- castor_extractor/visualization/powerbi/client/constants.py +0 -67
- castor_extractor/visualization/powerbi/client/credentials.py +3 -4
- castor_extractor/visualization/powerbi/client/credentials_test.py +3 -4
- castor_extractor/visualization/powerbi/client/endpoints.py +65 -0
- castor_extractor/visualization/powerbi/client/pagination.py +32 -0
- castor_extractor/visualization/powerbi/extract.py +14 -9
- castor_extractor/visualization/tableau_revamp/extract.py +10 -7
- castor_extractor/visualization/thoughtspot/__init__.py +3 -0
- castor_extractor/visualization/thoughtspot/assets.py +9 -0
- castor_extractor/visualization/thoughtspot/client/__init__.py +2 -0
- castor_extractor/visualization/thoughtspot/client/client.py +120 -0
- castor_extractor/visualization/thoughtspot/client/credentials.py +18 -0
- castor_extractor/visualization/thoughtspot/client/endpoints.py +12 -0
- castor_extractor/visualization/thoughtspot/client/utils.py +25 -0
- castor_extractor/visualization/thoughtspot/client/utils_test.py +57 -0
- castor_extractor/visualization/thoughtspot/extract.py +49 -0
- castor_extractor/warehouse/databricks/api_client.py +6 -0
- castor_extractor/warehouse/databricks/client.py +11 -5
- castor_extractor/warehouse/salesforce/client.py +1 -1
- castor_extractor/warehouse/salesforce/format.py +40 -30
- castor_extractor/warehouse/salesforce/format_test.py +61 -24
- castor_extractor/warehouse/sqlserver/client.py +2 -2
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/METADATA +25 -1
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/RECORD +42 -31
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/entry_points.txt +1 -0
- castor_extractor/visualization/powerbi/client/rest.py +0 -305
- castor_extractor/visualization/powerbi/client/rest_test.py +0 -290
- castor_extractor/visualization/powerbi/client/utils.py +0 -19
- castor_extractor/visualization/powerbi/client/utils_test.py +0 -24
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/LICENCE +0 -0
- {castor_extractor-0.19.8.dist-info → castor_extractor-0.20.4.dist-info}/WHEEL +0 -0
castor_extractor/visualization/powerbi/client/constants.py

@@ -1,62 +1,3 @@
-"""
-File regrouping all constants used in PowerBi client
-"""
-
-DEFAULT_TIMEOUT_IN_SECS = 30
-SCAN_READY = "Succeeded"
-# ModifiedSince params should not be older than 30 days
-RECENT_DAYS = 30
-
-GET = "GET"
-POST = "POST"
-
-
-class Urls:
-    """PowerBi's urls"""
-
-    CLIENT_APP_BASE = "https://login.microsoftonline.com/"
-    DEFAULT_SCOPE = "https://analysis.windows.net/powerbi/api/.default"
-    REST_API_BASE_PATH = "https://api.powerbi.com/v1.0/myorg"
-
-    # PBI rest API Routes
-    ACTIVITY_EVENTS = f"{REST_API_BASE_PATH}/admin/activityevents"
-    DASHBOARD = f"{REST_API_BASE_PATH}/admin/dashboards"
-    DATASETS = f"{REST_API_BASE_PATH}/admin/datasets"
-    GROUPS = f"{REST_API_BASE_PATH}/admin/groups"
-    METADATA_GET = f"{REST_API_BASE_PATH}/admin/workspaces/scanResult"
-    METADATA_POST = f"{REST_API_BASE_PATH}/admin/workspaces/getInfo"
-    METADATA_WAIT = f"{REST_API_BASE_PATH}/admin/workspaces/scanStatus"
-    REPORTS = f"{REST_API_BASE_PATH}/admin/reports"
-    WORKSPACE_IDS = (
-        "https://api.powerbi.com/v1.0/myorg/admin/workspaces/modified"
-    )
-
-
-class Batches:
-    """Batches used within PowerBI api calls"""
-
-    DEFAULT = 100
-    # The route we use to fetch workspaces info can retrieve a maximum of
-    # 100 workspaces per call
-    # More: https://learn.microsoft.com/en-us/rest/api/power-bi/admin/workspace-info-post-workspace-info#request-body
-    METADATA = 100
-
-
-class QueryParams:
-    """
-    Frequently used PowerBi query params
-    """
-
-    METADATA_SCAN = {
-        "datasetExpressions": True,
-        "datasetSchema": True,
-        "datasourceDetails": True,
-        "getArtifactUsers": True,
-        "lineage": True,
-    }
-    ACTIVE_WORKSPACE_FILTER = "state eq 'Active' and type eq 'Workspace'"
-
-
 class Keys:
     ACCESS_TOKEN = "access_token"  # noqa: S105
     ACTIVITY_EVENT_ENTITIES = "activityEventEntities"
@@ -64,15 +5,7 @@ class Keys:
     ID = "id"
     INACTIVE_WORKSPACES = "excludeInActiveWorkspaces"
     LAST_RESULT_SET = "lastResultSet"
-    MODIFIED_SINCE = "modifiedSince"
     PERSONAL_WORKSPACES = "excludePersonalWorkspaces"
     STATUS = "status"
     VALUE = "value"
     WORKSPACES = "workspaces"
-
-
-class Assertions:
-    """Assertion's messages"""
-
-    BATCH_TOO_BIG = f"Can not retrieve more than {Batches.METADATA} at the time"
-    DATETIME_TOO_OLD = "Date must be within 30 days range"
castor_extractor/visualization/powerbi/client/credentials.py

@@ -3,8 +3,7 @@ from typing import List, Optional
 from pydantic import Field, field_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
-
-
+DEFAULT_SCOPE = "https://analysis.windows.net/powerbi/api/.default"
 POWERBI_ENV_PREFIX = "CASTOR_POWERBI_"
 
 
@@ -20,9 +19,9 @@ class PowerbiCredentials(BaseSettings):
     client_id: str
     tenant_id: str
     secret: str = Field(repr=False)
-    scopes: List[str] = [
+    scopes: List[str] = [DEFAULT_SCOPE]
 
     @field_validator("scopes", mode="before")
     @classmethod
     def _check_scopes(cls, scopes: Optional[List[str]]) -> List[str]:
-        return scopes if scopes is not None else [
+        return scopes if scopes is not None else [DEFAULT_SCOPE]
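For context, PowerbiCredentials is a pydantic-settings model with the CASTOR_POWERBI_ env prefix, so its fields can also come from environment variables. A minimal sketch, assuming standard pydantic-settings behaviour (all values below are placeholders):

import os

os.environ["CASTOR_POWERBI_CLIENT_ID"] = "placeholder-client-id"
os.environ["CASTOR_POWERBI_TENANT_ID"] = "placeholder-tenant-id"
os.environ["CASTOR_POWERBI_SECRET"] = "placeholder-secret"

creds = PowerbiCredentials()
assert creds.scopes == [DEFAULT_SCOPE]  # default scope applied when none is provided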
castor_extractor/visualization/powerbi/client/credentials_test.py

@@ -1,5 +1,4 @@
-from .
-from .credentials import PowerbiCredentials
+from .credentials import DEFAULT_SCOPE, PowerbiCredentials
 
 
 def test_credentials():
@@ -13,7 +12,7 @@ def test_credentials():
         client_id=client_id,
         secret=secret,
     )
-    assert credentials.scopes == [
+    assert credentials.scopes == [DEFAULT_SCOPE]
 
     credentials = PowerbiCredentials(
         tenant_id=tenant_id,
@@ -21,7 +20,7 @@ def test_credentials():
         secret=secret,
         scopes=None,
     )
-    assert credentials.scopes == [
+    assert credentials.scopes == [DEFAULT_SCOPE]
 
     # empty scopes
     credentials = PowerbiCredentials(
castor_extractor/visualization/powerbi/client/endpoints.py

@@ -0,0 +1,65 @@
+from datetime import date, datetime
+from typing import Optional, Tuple
+
+from ....utils import at_midnight, format_date, yesterday
+
+_CLIENT_APP_BASE = "https://login.microsoftonline.com"
+_REST_API_BASE_PATH = "https://api.powerbi.com/v1.0/myorg"
+
+
+def _time_filter(day: Optional[date]) -> Tuple[datetime, datetime]:
+    target_day = day or yesterday()
+    start = at_midnight(target_day)
+    end = datetime.combine(target_day, datetime.max.time())
+    return start, end
+
+
+class PowerBiEndpointFactory:
+    @classmethod
+    def activity_events(cls, day: Optional[date]) -> str:
+        start, end = _time_filter(day)
+        url = f"{_REST_API_BASE_PATH}/admin/activityevents"
+        url += "?$filter=Activity eq 'viewreport'"
+        url += f"&startDateTime='{format_date(start)}'"
+        url += f"&endDateTime='{format_date(end)}'"
+        return url
+
+    @classmethod
+    def authority(cls, tenant_id: str) -> str:
+        return f"{_CLIENT_APP_BASE}/{tenant_id}"
+
+    @classmethod
+    def dashboards(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/dashboards"
+
+    @classmethod
+    def datasets(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/datasets"
+
+    @classmethod
+    def groups(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/groups"
+
+    @classmethod
+    def metadata_create_scan(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/workspaces/getInfo"
+
+    @classmethod
+    def metadata_scan_result(cls, scan_id: int) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/workspaces/scanResult/{scan_id}"
+
+    @classmethod
+    def metadata_scan_status(cls, scan_id: int) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/workspaces/scanStatus/{scan_id}"
+
+    @classmethod
+    def pages(cls, report_id: str) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/reports/{report_id}/pages"
+
+    @classmethod
+    def reports(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/reports"
+
+    @classmethod
+    def workspace_ids(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/workspaces/modified"
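For context, this factory centralizes the admin API routes that previously lived in the removed constants module. A minimal usage sketch (the scan id and day are placeholder values; the exact date formatting depends on format_date):

from datetime import date

PowerBiEndpointFactory.metadata_scan_status(scan_id=42)
# -> "https://api.powerbi.com/v1.0/myorg/admin/workspaces/scanStatus/42"

PowerBiEndpointFactory.activity_events(day=date(2024, 9, 19))
# -> ".../admin/activityevents?$filter=Activity eq 'viewreport'&startDateTime='...'&endDateTime='...'"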
castor_extractor/visualization/powerbi/client/pagination.py

@@ -0,0 +1,32 @@
+from typing import Optional
+
+from pydantic import ConfigDict
+from pydantic.alias_generators import to_camel
+
+from ....utils import (
+    FetchNextPageBy,
+    PaginationModel,
+)
+
+
+class PowerBiPagination(PaginationModel):
+    model_config = ConfigDict(
+        alias_generator=to_camel,
+        populate_by_name=True,
+        from_attributes=True,
+    )
+
+    fetch_by: FetchNextPageBy = FetchNextPageBy.URL
+
+    activity_event_entities: list
+    continuation_uri: Optional[str] = None
+    last_result_set: bool = False
+
+    def is_last(self) -> bool:
+        return self.last_result_set
+
+    def next_page_payload(self) -> Optional[str]:
+        return self.continuation_uri
+
+    def page_results(self) -> list:
+        return self.activity_event_entities
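For context, PowerBiPagination exposes the hooks a generic pagination loop needs: is_last, next_page_payload and page_results. A hand-rolled sketch of how such a model could be consumed (get_json is a placeholder for whatever performs the HTTP request; the package's own loop lives in the shared fetch_all_pages utility):

def iter_activity_events(get_json, first_url: str):
    """Follow continuationUri pages until lastResultSet is True."""
    url = first_url
    while url:
        # camelCase response keys map onto the snake_case fields via the to_camel aliases
        page = PowerBiPagination(**get_json(url))
        yield from page.page_results()
        url = None if page.is_last() else page.next_page_payload()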
castor_extractor/visualization/powerbi/extract.py

@@ -1,4 +1,5 @@
-
+import logging
+from typing import Iterable, Tuple, Union
 
 from ...utils import (
     OUTPUT_DIR,
@@ -9,19 +10,23 @@ from ...utils import (
     write_json,
     write_summary,
 )
-from .assets import METADATA_ASSETS, PowerBiAsset
-from .client import
+from .assets import METADATA_ASSETS, REPORTS_ASSETS, PowerBiAsset
+from .client import PowerbiClient, PowerbiCredentials
+
+logger = logging.getLogger(__name__)
 
 
 def iterate_all_data(
-    client:
-) -> Iterable[Tuple[PowerBiAsset, Union[
+    client: PowerbiClient,
+) -> Iterable[Tuple[PowerBiAsset, Union[list, dict]]]:
     for asset in PowerBiAsset:
-        if asset in METADATA_ASSETS:
+        if asset in METADATA_ASSETS + REPORTS_ASSETS:
             continue
 
-
-
+        logger.info(f"Extracting {asset.name} from API")
+        data = list(deep_serialize(client.fetch(asset)))
+        yield asset, data
+        logger.info(f"Extracted {len(data)} {asset.name} from API")
 
 
 def extract_all(**kwargs) -> None:
@@ -31,7 +36,7 @@ def extract_all(**kwargs) -> None:
     """
     _output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)
     creds = PowerbiCredentials(**kwargs)
-    client =
+    client = PowerbiClient(creds)
     ts = current_timestamp()
 
     for key, data in iterate_all_data(client):
castor_extractor/visualization/tableau_revamp/extract.py

@@ -11,7 +11,7 @@ from ...utils import (
     write_summary,
 )
 from .assets import TableauRevampAsset
-from .client import TableauRevampClient
+from .client import TableauRevampClient, TableauRevampCredentials
 
 logger = logging.getLogger(__name__)
 
@@ -26,16 +26,19 @@ def iterate_all_data(
         yield asset, deep_serialize(data)
 
 
-def extract_all(
+def extract_all(**kwargs) -> None:
     """
-    Extract Data
-
-    If errors from Tableau's API are catch store them locally in file under the output_directory
+    Extract Data From tableau and store it locally in files under the
+    output_directory
     """
-    output_directory = kwargs.get("
-
+    output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)
+    with_pulse = kwargs.get("with_pulse") or False
     timestamp = current_timestamp()
 
+    credentials = TableauRevampCredentials(**kwargs)
+    client = TableauRevampClient(credentials, with_pulse=with_pulse)
+    client.login()
+
     for key, data in iterate_all_data(client):
         filename = get_output_filename(key.value, output_directory, timestamp)
         write_json(filename, data)
castor_extractor/visualization/thoughtspot/client/client.py

@@ -0,0 +1,120 @@
+from typing import Dict, Iterator, Optional
+
+import requests
+
+from ....utils import (
+    APIClient,
+    BearerAuth,
+    RequestSafeMode,
+    build_url,
+    handle_response,
+)
+from ..assets import (
+    ThoughtspotAsset,
+)
+from .credentials import (
+    ThoughtspotCredentials,
+)
+from .endpoints import (
+    ThoughtspotEndpointFactory,
+)
+from .utils import (
+    usage_liveboard_reader,
+)
+
+_AUTH_TIMEOUT_S = 60
+_THOUGHTSPOT_HEADERS = {
+    "X-Requested-By": "ThoughtSpot",
+    "Accept": "application/json",
+    "Content-Type": "application/json",
+}
+_METADATA_BATCH_SIZE = 100
+_USAGE_LIVEBOARD_ID = "bea79810-145f-4ad0-a02c-4177a6e7d861"
+# By default, no errors are ignored for the moment
+THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
+
+
+class ThoughtspotBearerAuth(BearerAuth):
+    def __init__(self, host: str, token_payload: Dict[str, str]):
+        auth_endpoint = ThoughtspotEndpointFactory.authentication()
+        self.authentication_url = build_url(host, auth_endpoint)
+        self.token_payload = token_payload
+
+    def fetch_token(self):
+        token_api_path = self.authentication_url
+        token_response = requests.post(
+            token_api_path, data=self.token_payload, timeout=_AUTH_TIMEOUT_S
+        )
+        return handle_response(token_response)["token"]
+
+
+class ThoughtspotClient(APIClient):
+    def __init__(
+        self,
+        credentials: ThoughtspotCredentials,
+        safe_mode: Optional[RequestSafeMode] = None,
+    ):
+        auth = ThoughtspotBearerAuth(
+            host=credentials.base_url,
+            token_payload=credentials.dict(),
+        )
+        super().__init__(
+            host=credentials.base_url,
+            auth=auth,
+            headers=_THOUGHTSPOT_HEADERS,
+            safe_mode=safe_mode or THOUGHTSPOT_SAFE_MODE,
+        )
+
+    def _metadata_search(
+        self,
+        metadata_type: str,
+    ) -> Iterator[dict]:
+        offset = 0
+        while True:
+            metadata = self._post(
+                ThoughtspotEndpointFactory.metadata_search(),
+                data={
+                    "metadata": [{"type": metadata_type}],
+                    "include_details": True,
+                    "record_size": _METADATA_BATCH_SIZE,
+                    "record_offset": offset,
+                },
+            )
+            yield from metadata
+            if len(metadata) < _METADATA_BATCH_SIZE:
+                break
+            offset = offset + _METADATA_BATCH_SIZE
+
+    def _get_all_liveboards(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="LIVEBOARD")
+
+    def _get_all_columns(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="LOGICAL_COLUMN")
+
+    def _get_all_tables(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="LOGICAL_TABLE")
+
+    def _get_liveboards_usages(self) -> Iterator[dict]:
+        data = self._post(
+            endpoint=ThoughtspotEndpointFactory.liveboard(),
+            headers={"Accept": "application/octet-stream"},
+            data={
+                "metadata_identifier": _USAGE_LIVEBOARD_ID,
+                "file_format": "CSV",
+                "visualization_identifiers": [
+                    "Popular Liveboards Last 30 Days"
+                ],
+            },
+            handler=lambda x: x.text,
+        )
+        yield from usage_liveboard_reader(data)
+
+    def fetch(self, asset: ThoughtspotAsset):
+        if asset == ThoughtspotAsset.LIVEBOARDS:
+            yield from self._get_all_liveboards()
+
+        if asset == ThoughtspotAsset.USAGES:
+            yield from self._get_liveboards_usages()
+
+        if asset == ThoughtspotAsset.LOGICAL_TABLES:
+            yield from self._get_all_tables()
castor_extractor/visualization/thoughtspot/client/credentials.py

@@ -0,0 +1,18 @@
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+THOUGHTSPOT_ENV_PREFIX = "CASTOR_THOUGHTSPOT_"
+
+
+class ThoughtspotCredentials(BaseSettings):
+    """Class to handle Thoughtspot rest API permissions"""
+
+    model_config = SettingsConfigDict(
+        env_prefix=THOUGHTSPOT_ENV_PREFIX,
+        extra="ignore",
+        populate_by_name=True,
+    )
+
+    username: str
+    password: str = Field(repr=False)
+    base_url: str
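Likewise, with the CASTOR_THOUGHTSPOT_ prefix the ThoughtSpot credentials can be supplied through the environment. A minimal sketch, assuming standard pydantic-settings behaviour (placeholder values):

import os

os.environ["CASTOR_THOUGHTSPOT_USERNAME"] = "extractor-user"
os.environ["CASTOR_THOUGHTSPOT_PASSWORD"] = "placeholder-password"
os.environ["CASTOR_THOUGHTSPOT_BASE_URL"] = "https://example.thoughtspot.cloud"

credentials = ThoughtspotCredentials()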
castor_extractor/visualization/thoughtspot/client/endpoints.py

@@ -0,0 +1,12 @@
+class ThoughtspotEndpointFactory:
+    @classmethod
+    def authentication(cls) -> str:
+        return "api/rest/2.0/auth/token/full"
+
+    @classmethod
+    def metadata_search(cls) -> str:
+        return "api/rest/2.0/metadata/search"
+
+    @classmethod
+    def liveboard(cls) -> str:
+        return "api/rest/2.0/report/liveboard"
castor_extractor/visualization/thoughtspot/client/utils.py

@@ -0,0 +1,25 @@
+import csv
+from io import StringIO
+from typing import Iterator
+
+
+def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
+    """
+    Converts a CSV string into an iterator of dictionaries after
+    ignoring the first 6 lines, using the 7th line as the header.
+    First 6 lines looks like the following:
+
+    "Data extract produced by Castor on 09/19/2024 06:54"
+    "Filters applied on data :"
+    "User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
+    "Pinboard NOT IN [mlm - availability pinboard,null]"
+    "Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
+    "Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+
+    """
+    csv_file = StringIO(usage_liveboard_csv)
+
+    for _ in range(7):
+        next(csv_file)
+
+    yield from csv.DictReader(csv_file)
castor_extractor/visualization/thoughtspot/client/utils_test.py

@@ -0,0 +1,57 @@
+from .utils import (
+    usage_liveboard_reader,
+)
+
+VALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
+"Filters applied on data :"
+"User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
+"Pinboard NOT IN [mlm - availability pinboard,null]"
+"Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
+"Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+""
+"Pinboard","Pinboard Views","Unique Number of User"
+"Market Report","559","19"
+"Retailer report","204","14"
+"Second-hand market","72","6"
+"September test","25","2"'''
+
+
+# Invalid CSV input (missing data rows)
+INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
+"Filters applied on data :"
+"User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
+"Pinboard NOT IN [mlm - availability pinboard,null]"
+"Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
+"Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+""'''
+
+
+def test_usage_liveboard_reader():
+    expected_output = [
+        {
+            "Pinboard": "Market Report",
+            "Pinboard Views": "559",
+            "Unique Number of User": "19",
+        },
+        {
+            "Pinboard": "Retailer report",
+            "Pinboard Views": "204",
+            "Unique Number of User": "14",
+        },
+        {
+            "Pinboard": "Second-hand market",
+            "Pinboard Views": "72",
+            "Unique Number of User": "6",
+        },
+        {
+            "Pinboard": "September test",
+            "Pinboard Views": "25",
+            "Unique Number of User": "2",
+        },
+    ]
+
+    result = list(usage_liveboard_reader(VALID_CSV))
+    assert result == expected_output
+
+    result = list(usage_liveboard_reader(INVALID_CSV))
+    assert result == []  # Expect an empty result since there is no data
castor_extractor/visualization/thoughtspot/extract.py

@@ -0,0 +1,49 @@
+import logging
+from typing import Iterable, Iterator, Tuple, Union
+
+from ...utils import (
+    OUTPUT_DIR,
+    current_timestamp,
+    deep_serialize,
+    from_env,
+    get_output_filename,
+    write_json,
+    write_summary,
+)
+from .assets import ThoughtspotAsset
+from .client import (
+    ThoughtspotClient,
+    ThoughtspotCredentials,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def iterate_all_data(
+    client: ThoughtspotClient,
+) -> Iterable[Tuple[ThoughtspotAsset, Union[list, Iterator, dict]]]:
+    """Iterate over the extracted data from Thoughtspot"""
+
+    for asset in ThoughtspotAsset:
+        logger.info(f"Extracting {asset.value} from API")
+        data = client.fetch(asset)
+        yield asset, deep_serialize(data)
+
+
+def extract_all(**kwargs) -> None:
+    """
+    Extract data from Thoughtspot API
+    Store the output files locally under the given output_directory
+    """
+    _output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)
+
+    credentials = ThoughtspotCredentials(**kwargs)
+    client = ThoughtspotClient(credentials=credentials)
+
+    ts = current_timestamp()
+
+    for key, data in iterate_all_data(client):
+        filename = get_output_filename(key.name.lower(), _output_directory, ts)
+        write_json(filename, list(data))
+
+    write_summary(_output_directory, ts, host=credentials.base_url)
castor_extractor/warehouse/databricks/api_client.py

@@ -1,5 +1,6 @@
 import logging
 from functools import partial
+from http import HTTPStatus
 from typing import Iterator, List, Optional, Set, Tuple
 
 import requests
@@ -12,6 +13,7 @@ from ...utils import (
     fetch_all_pages,
     handle_response,
     retry,
+    retry_request,
     safe_mode,
 )
 from ..abstract import TimeFilter
@@ -135,6 +137,10 @@ class DatabricksAPIClient(APIClient):
         max_retries=_RETRY_ATTEMPTS,
         base_ms=_RETRY_BASE_MS,
     )
+    @retry_request(
+        status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
+        max_retries=_RETRY_ATTEMPTS,
+    )
     def get_single_column_lineage(
         self,
         names: Tuple[str, str],
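retry_request comes from the shared utils; judging from its use here it re-issues the decorated call when the HTTP status is one of the listed codes (429 Too Many Requests). A simplified, hypothetical sketch of such a decorator, not the package's actual implementation:

import functools
import time


def retry_on_status(status_codes, max_retries=3, base_s=1.0):
    """Hypothetical sketch: re-issue a call whose response status is in status_codes."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            response = func(*args, **kwargs)
            for attempt in range(max_retries):
                if getattr(response, "status_code", None) not in status_codes:
                    break
                time.sleep(base_s * 2**attempt)  # simple exponential backoff
                response = func(*args, **kwargs)
            return response
        return wrapper
    return decorator


# e.g. @retry_on_status(status_codes=(429,), max_retries=3)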
castor_extractor/warehouse/databricks/client.py

@@ -15,7 +15,8 @@ from .types import TablesColumns, TimestampedLink
 
 logger = logging.getLogger(__name__)
 
-
+_THREADS_COLUMN_LINEAGE = 2
+_THREADS_TABLE_LINEAGE = 10
 
 
 class DatabricksClient:
@@ -99,7 +100,7 @@ class DatabricksClient:
         Wrapper function that retrieves all table lineage
         """
         # retrieve table lineage
-        with ThreadPoolExecutor(max_workers=
+        with ThreadPoolExecutor(max_workers=_THREADS_TABLE_LINEAGE) as executor:
             table_paths = [
                 ".".join([table["schema_id"], table["table_name"]])
                 for table in tables
@@ -121,10 +122,15 @@ class DatabricksClient:
         candidate_paths = paths_for_column_lineage(
             tables, columns, table_lineage
         )
+        # retrieve column lineage
+        with ThreadPoolExecutor(
+            max_workers=_THREADS_COLUMN_LINEAGE
+        ) as executor:
+            results = executor.map(
+                self.api_client.get_single_column_lineage, candidate_paths
+            )
         lineages: List[TimestampedLink] = [
-            link
-            for paths in candidate_paths
-            for link in self.api_client.get_single_column_lineage(paths)
+            link for links in results for link in links
         ]
         deduplicated = deduplicate_lineage(lineages)
         return self.formatter.format_lineage(deduplicated)
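The column-lineage calls now go through executor.map with a small worker pool instead of a sequential comprehension. A self-contained sketch of the same pattern, with a toy function standing in for get_single_column_lineage:

from concurrent.futures import ThreadPoolExecutor


def fetch_links(path: str) -> list:
    # toy stand-in for api_client.get_single_column_lineage
    return [f"{path} -> upstream"]


candidate_paths = ["schema.table_a", "schema.table_b"]

with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(fetch_links, candidate_paths)  # runs the calls concurrently

lineages = [link for links in results for link in links]  # flatten, as in the diff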