castor-extractor 0.22.0__py3-none-any.whl → 0.22.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of castor-extractor might be problematic.

Files changed (38)
  1. CHANGELOG.md +20 -0
  2. castor_extractor/utils/__init__.py +1 -0
  3. castor_extractor/utils/time.py +4 -0
  4. castor_extractor/utils/time_test.py +8 -1
  5. castor_extractor/visualization/looker_studio/__init__.py +6 -0
  6. castor_extractor/visualization/looker_studio/assets.py +6 -0
  7. castor_extractor/visualization/looker_studio/client/__init__.py +3 -0
  8. castor_extractor/visualization/looker_studio/client/admin_sdk_client.py +90 -0
  9. castor_extractor/visualization/looker_studio/client/client.py +37 -0
  10. castor_extractor/visualization/looker_studio/client/credentials.py +20 -0
  11. castor_extractor/visualization/looker_studio/client/endpoints.py +18 -0
  12. castor_extractor/visualization/looker_studio/client/enums.py +8 -0
  13. castor_extractor/visualization/looker_studio/client/looker_studio_api_client.py +102 -0
  14. castor_extractor/visualization/looker_studio/client/pagination.py +31 -0
  15. castor_extractor/visualization/looker_studio/client/scopes.py +6 -0
  16. castor_extractor/visualization/sigma/client/client.py +64 -10
  17. castor_extractor/visualization/thoughtspot/assets.py +3 -1
  18. castor_extractor/visualization/thoughtspot/client/client.py +67 -14
  19. castor_extractor/visualization/thoughtspot/client/utils.py +10 -4
  20. castor_extractor/visualization/thoughtspot/client/utils_test.py +22 -4
  21. castor_extractor/warehouse/databricks/api_client.py +2 -60
  22. castor_extractor/warehouse/databricks/client.py +4 -47
  23. castor_extractor/warehouse/databricks/client_test.py +1 -35
  24. castor_extractor/warehouse/databricks/credentials.py +4 -6
  25. castor_extractor/warehouse/databricks/enums.py +15 -0
  26. castor_extractor/warehouse/databricks/extract.py +13 -11
  27. castor_extractor/warehouse/databricks/lineage.py +47 -119
  28. castor_extractor/warehouse/databricks/lineage_test.py +86 -31
  29. castor_extractor/warehouse/databricks/sql_client.py +23 -8
  30. castor_extractor/warehouse/databricks/types.py +0 -7
  31. castor_extractor/warehouse/salesforce/format.py +12 -5
  32. castor_extractor/warehouse/salesforce/format_test.py +22 -6
  33. {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/METADATA +23 -1
  34. {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/RECORD +37 -26
  35. castor_extractor/warehouse/databricks/test_constants.py +0 -79
  36. {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/LICENCE +0 -0
  37. {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/WHEEL +0 -0
  38. {castor_extractor-0.22.0.dist-info → castor_extractor-0.22.5.dist-info}/entry_points.txt +0 -0
CHANGELOG.md CHANGED
@@ -1,6 +1,26 @@
 
 # Changelog
 
+## 0.22.5 - 2025-01-09
+
+* Databricks: validate and deduplicate lineage links
+
+## 0.22.4 - 2025-01-08
+
+* ThoughtSpot: extract answers
+
+## 0.22.3 - 2024-12-10
+
+* Databricks: extract lineage from system tables
+
+## 0.22.2 - 2024-12-06
+
+* Sigma: multithreading to retrieve lineage
+
+## 0.22.1 - 2024-12-05
+
+* Salesforce: deduplicate tables
+
 ## 0.22.0 - 2024-12-04
 
 * Stop supporting python3.8
castor_extractor/utils/__init__.py CHANGED
@@ -45,6 +45,7 @@ from .time import (
     current_timestamp,
     date_after,
     format_date,
+    format_rfc_3339_date,
     past_date,
     timestamp_ms,
     yesterday,
castor_extractor/utils/time.py CHANGED
@@ -63,5 +63,9 @@ def format_date(timestamp: Union[datetime, date]) -> str:
     return timestamp.strftime(ISO_FORMAT)
 
 
+def format_rfc_3339_date(timestamp: datetime) -> str:
+    return timestamp.isoformat(timespec="seconds") + "Z"
+
+
 def yesterday() -> date:
     return current_date() - timedelta(days=1)
castor_extractor/utils/time_test.py CHANGED
@@ -1,6 +1,6 @@
 from datetime import date, datetime
 
-from .time import at_midnight, date_after, timestamp_ms
+from .time import at_midnight, date_after, format_rfc_3339_date, timestamp_ms
 
 
 def test_at_midnight():
@@ -17,3 +17,10 @@ def test_timestamp_ms():
     result = timestamp_ms(dt)
     expected = 670636800000
     assert result == expected
+
+
+def test_format_rfc_3339_date():
+    dt = datetime(1995, 4, 3, 2, 1)
+    result = format_rfc_3339_date(dt)
+    expected = "1995-04-03T02:01:00Z"
+    assert result == expected
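Note: format_rfc_3339_date assumes a naive datetime already expressed in UTC, since it simply appends "Z" to isoformat(timespec="seconds"). A small illustrative sketch (not part of the package) of the kind of time window the Admin SDK client builds with it:

from datetime import datetime

def format_rfc_3339_date(timestamp: datetime) -> str:
    # same body as the helper added above; "Z" marks the value as UTC
    return timestamp.isoformat(timespec="seconds") + "Z"

start_time = format_rfc_3339_date(datetime(2025, 1, 8))  # "2025-01-08T00:00:00Z"
end_time = format_rfc_3339_date(datetime(2025, 1, 9))    # "2025-01-09T00:00:00Z"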
castor_extractor/visualization/looker_studio/__init__.py ADDED
@@ -0,0 +1,6 @@
+from .assets import LookerStudioAsset
+from .client import (
+    LookerStudioAssetType,
+    LookerStudioClient,
+    LookerStudioCredentials,
+)
castor_extractor/visualization/looker_studio/assets.py ADDED
@@ -0,0 +1,6 @@
+from ...types import ExternalAsset
+
+
+class LookerStudioAsset(ExternalAsset):
+    ASSETS = "assets"
+    VIEW_ACTIVITY = "view_activity"
castor_extractor/visualization/looker_studio/client/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .client import LookerStudioClient
+from .credentials import LookerStudioCredentials
+from .enums import LookerStudioAssetType
castor_extractor/visualization/looker_studio/client/admin_sdk_client.py ADDED
@@ -0,0 +1,90 @@
+from typing import Iterator, Optional
+
+from google.oauth2.service_account import Credentials
+from googleapiclient import discovery  # type: ignore
+
+from ....utils import (
+    at_midnight,
+    current_date,
+    fetch_all_pages,
+    format_rfc_3339_date,
+    past_date,
+)
+from .credentials import LookerStudioCredentials
+from .pagination import LookerStudioPagination
+from .scopes import SCOPES
+
+USER_EMAIL_FIELD = "primaryEmail"
+
+
+class AdminSDKClient:
+    """
+    Client to call the Report API and Directory API.
+    The service account must impersonate and admin account.
+    """
+
+    def __init__(self, credentials: LookerStudioCredentials):
+        self._credentials = Credentials.from_service_account_info(
+            credentials.model_dump(),
+            scopes=SCOPES,
+            subject=credentials.admin_email,  # impersonates an admin
+        )
+        self.directory_api = discovery.build(
+            "admin", "directory_v1", credentials=self._credentials
+        )
+        self.report_api = discovery.build(
+            "admin", "reports_v1", credentials=self._credentials
+        )
+
+    def list_users(self) -> Iterator[dict]:
+        """
+        Lists all users in the domain; only the primaryEmail field is selected.
+        Note:
+        * `my_customer` is an alias to represent the account's `customerId`
+        * `domain_public` allows non-admins to list users. This is technically
+          not necessary here because an admin account is impersonated, but it
+          avoids tapping into unnecessary data & serves for future reference.
+        See
+        https://googleapis.github.io/google-api-python-client/docs/dyn/admin_directory_v1.users.html#list
+        https://developers.google.com/admin-sdk/directory/reference/rest/v1/users/list
+        https://developers.google.com/admin-sdk/directory/v1/guides/manage-users#retrieve_users_non_admin
+        https://stackoverflow.com/a/71083443/14448410
+        """
+
+        def _users(pagination_params: Optional[dict] = None) -> dict:
+            parameters = {
+                "viewType": "domain_public",
+                "customer": "my_customer",
+                "fields": f"users({USER_EMAIL_FIELD}), nextPageToken",
+                **(pagination_params or {}),
+            }
+
+            return self.directory_api.users().list(**parameters).execute()
+
+        yield from fetch_all_pages(_users, LookerStudioPagination)
+
+    def list_view_events(self) -> Iterator[dict]:
+        """
+        Lists all Data Studio View events of the past day.
+        See
+        https://googleapis.github.io/google-api-python-client/docs/dyn/admin_reports_v1.activities.html
+        https://developers.google.com/admin-sdk/reports/reference/rest/v1/activities/list
+        https://developers.google.com/admin-sdk/reports/v1/appendix/activity/data-studio#VIEW
+        """
+
+        def _activity(pagination_params: Optional[dict] = None) -> dict:
+            yesterday = format_rfc_3339_date(at_midnight(past_date(1)))
+            today = format_rfc_3339_date(at_midnight(current_date()))
+
+            parameters = {
+                "userKey": "all",
+                "applicationName": "data_studio",
+                "eventName": "VIEW",
+                "startTime": yesterday,
+                "endTime": today,
+                **(pagination_params or {}),
+            }
+
+            return self.report_api.activities().list(**parameters).execute()
+
+        yield from fetch_all_pages(_activity, LookerStudioPagination)
castor_extractor/visualization/looker_studio/client/client.py ADDED
@@ -0,0 +1,37 @@
+from typing import Iterator
+
+from .. import LookerStudioAsset
+from .admin_sdk_client import USER_EMAIL_FIELD, AdminSDKClient
+from .credentials import LookerStudioCredentials
+from .looker_studio_api_client import LookerStudioAPIClient
+
+
+class LookerStudioClient:
+    """
+    Acts as a wrapper class to fetch Looker Studio assets, which requires
+    coordinating calls between the Admin SDK API and the Looker Studio API.
+    """
+
+    def __init__(self, credentials: LookerStudioCredentials):
+        self.admin_sdk_client = AdminSDKClient(credentials)
+        self.looker_studio_client = LookerStudioAPIClient(credentials)
+
+    def _get_assets(self) -> Iterator[dict]:
+        """
+        Extracts reports and data sources user by user.
+        """
+        users = self.admin_sdk_client.list_users()
+
+        for user in users:
+            email = user[USER_EMAIL_FIELD]
+            yield from self.looker_studio_client.fetch_user_assets(email)
+
+    def fetch(self, asset: LookerStudioAsset) -> Iterator[dict]:
+        if asset == LookerStudioAsset.VIEW_ACTIVITY:
+            yield from self.admin_sdk_client.list_view_events()
+
+        elif asset == LookerStudioAsset.ASSETS:
+            yield from self._get_assets()
+
+        else:
+            raise ValueError(f"The asset {asset}, is not supported")
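A hypothetical usage sketch of the new Looker Studio connector (illustrative only; the key file path and admin email below are placeholders, not values from this release):

import json

from castor_extractor.visualization.looker_studio import (
    LookerStudioAsset,
    LookerStudioClient,
    LookerStudioCredentials,
)

# A Google service-account key JSON supplies most LookerStudioCredentials fields;
# admin_email is the Workspace admin that the service account impersonates.
with open("service_account.json") as f:
    info = json.load(f)

credentials = LookerStudioCredentials(admin_email="admin@example.com", **info)
client = LookerStudioClient(credentials)

view_events = list(client.fetch(LookerStudioAsset.VIEW_ACTIVITY))  # yesterday's VIEW events
assets = list(client.fetch(LookerStudioAsset.ASSETS))  # reports and data sources, user by user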
castor_extractor/visualization/looker_studio/client/credentials.py ADDED
@@ -0,0 +1,20 @@
+from pydantic import BaseModel, SecretStr, field_serializer
+
+
+class LookerStudioCredentials(BaseModel):
+    admin_email: str
+    auth_provider_x509_cert_url: str
+    auth_uri: str
+    client_email: str
+    client_id: str
+    client_x509_cert_url: str
+    private_key: SecretStr
+    private_key_id: str
+    project_id: str
+    token_uri: str
+    type: str
+
+    @field_serializer("private_key")
+    def dump_secret(self, pk):
+        """When using model_dump, show private_key value"""
+        return pk.get_secret_value()
castor_extractor/visualization/looker_studio/client/endpoints.py ADDED
@@ -0,0 +1,18 @@
+class LookerStudioAPIEndpoint:
+    BASE_PATH = "https://datastudio.googleapis.com"
+
+    @classmethod
+    def search(cls) -> str:
+        """
+        Search a user's assets.
+        See https://developers.google.com/looker-studio/integrate/api/reference/assets/search
+        """
+        return f"{cls.BASE_PATH}/v1/assets:search"
+
+    @classmethod
+    def permissions(cls, asset_name: str) -> str:
+        """
+        Get the permissions of an asset. The user must be the owner of the asset.
+        See https://developers.google.com/looker-studio/integrate/api/reference/permissions/get
+        """
+        return f"{cls.BASE_PATH}/v1/assets/{asset_name}/permissions"
castor_extractor/visualization/looker_studio/client/enums.py ADDED
@@ -0,0 +1,8 @@
+from enum import Enum
+
+
+class LookerStudioAssetType(Enum):
+    DATA_SOURCE = "DATA_SOURCE"
+    EXPLORER = "EXPLORER"
+    REPORT = "REPORT"
+    WORKSPACE = "WORKSPACE"
castor_extractor/visualization/looker_studio/client/looker_studio_api_client.py ADDED
@@ -0,0 +1,102 @@
+from functools import partial
+from typing import Iterator, Optional
+
+from google.auth.transport.requests import Request
+from google.oauth2.service_account import Credentials
+
+from ....utils import (
+    APIClient,
+    BearerAuth,
+    fetch_all_pages,
+)
+from .credentials import LookerStudioCredentials
+from .endpoints import LookerStudioAPIEndpoint
+from .enums import LookerStudioAssetType
+from .pagination import LookerStudioPagination
+from .scopes import SCOPES
+
+
+class LookerStudioAPIAuth(BearerAuth):
+    def __init__(
+        self,
+        credentials: LookerStudioCredentials,
+        subject: Optional[str] = None,
+    ):
+        """
+        Instantiates the service account credentials.
+        If a `subject` email is passed, the service account will impersonate
+        that user and make requests on that user's behalf.
+        """
+        self._credentials = Credentials.from_service_account_info(
+            credentials.model_dump(), scopes=SCOPES
+        )
+        if subject:
+            self._credentials = self._credentials.with_subject(subject)
+
+    def fetch_token(self):
+        self._credentials.refresh(Request())
+        return self._credentials.token
+
+
+class LookerStudioAPIClient(APIClient):
+    def __init__(self, credentials: LookerStudioCredentials):
+        auth = LookerStudioAPIAuth(credentials=credentials)
+        super().__init__(auth=auth)
+
+        self._credentials = credentials
+
+    def _is_private_asset(self, asset_name: str) -> bool:
+        """
+        Returns True if the asset is not viewable by anyone other than the owner.
+
+        The permissions dict contains `Role: Member[]` key-value pairs and has
+        at least one key-value pair to define the asset's unique OWNER.
+        If another key is present, it means the asset was shared with
+        another person or group.
+
+        See also https://developers.google.com/looker-studio/integrate/api/reference/types#Permissions
+        """
+        data = self._get(LookerStudioAPIEndpoint.permissions(asset_name))
+        permissions = data["permissions"]
+        return len(permissions.keys()) == 1
+
+    def _user_assets(
+        self, asset_type: LookerStudioAssetType, user_email: str
+    ) -> Iterator[dict]:
+        """
+        Yields all assets of the given type, owned by the given user and visible
+        by other members.
+        """
+        request = partial(
+            self._get,
+            LookerStudioAPIEndpoint.search(),
+            params={"assetTypes": [asset_type.value]},
+        )
+        assets = fetch_all_pages(request, LookerStudioPagination)
+
+        for asset in assets:
+            asset_name = asset["name"]
+            owner = asset["owner"]
+            if owner == user_email and not self._is_private_asset(asset_name):
+                yield asset
+
+    def _impersonate_user(self, user_email: str):
+        self._auth = LookerStudioAPIAuth(
+            credentials=self._credentials, subject=user_email
+        )
+
+    def fetch_user_assets(self, user_email: str) -> Iterator[dict]:
+        """Yields assets (reports and data sources) shared by the given user."""
+        self._impersonate_user(user_email)
+
+        reports = self._user_assets(
+            asset_type=LookerStudioAssetType.REPORT,
+            user_email=user_email,
+        )
+        data_sources = self._user_assets(
+            asset_type=LookerStudioAssetType.DATA_SOURCE,
+            user_email=user_email,
+        )
+
+        yield from reports
+        yield from data_sources
castor_extractor/visualization/looker_studio/client/pagination.py ADDED
@@ -0,0 +1,31 @@
+from typing import Optional
+
+from pydantic import AliasChoices, ConfigDict, Field
+from pydantic.alias_generators import to_camel
+
+from ....utils import PaginationModel
+
+NEXT_PAGE_KEY = "pageToken"
+
+
+class LookerStudioPagination(PaginationModel):
+    items: list = Field(
+        default_factory=list,
+        validation_alias=AliasChoices("items", "users", "assets"),
+    )
+    next_page_token: Optional[str] = None
+
+    model_config = ConfigDict(
+        alias_generator=to_camel,
+        populate_by_name=True,
+        from_attributes=True,
+    )
+
+    def is_last(self) -> bool:
+        return self.next_page_token is None
+
+    def next_page_payload(self) -> dict:
+        return {NEXT_PAGE_KEY: self.next_page_token}
+
+    def page_results(self) -> list:
+        return self.items
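fetch_all_pages itself is not part of this diff; judging only from the contract LookerStudioPagination implements (page_results, is_last, next_page_payload), a pager consuming it could look roughly like the following hypothetical sketch:

from typing import Callable, Iterator, Optional, Type

def fetch_all_pages_sketch(
    request: Callable[[Optional[dict]], dict],
    model: Type[LookerStudioPagination],
) -> Iterator[dict]:
    """Hypothetical pager: call `request`, yield each page's items, then feed the
    next-page token back into `request` until the model reports the last page."""
    payload: Optional[dict] = None
    while True:
        page = model.model_validate(request(payload))
        yield from page.page_results()
        if page.is_last():
            break
        payload = page.next_page_payload()  # e.g. {"pageToken": "..."}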
castor_extractor/visualization/looker_studio/client/scopes.py ADDED
@@ -0,0 +1,6 @@
+SCOPES = (
+    "https://www.googleapis.com/auth/datastudio",
+    "https://www.googleapis.com/auth/userinfo.profile",
+    "https://www.googleapis.com/auth/admin.reports.audit.readonly",
+    "https://www.googleapis.com/auth/admin.directory.user.readonly",
+)
castor_extractor/visualization/sigma/client/client.py CHANGED
@@ -1,9 +1,11 @@
 from collections.abc import Iterator
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from http import HTTPStatus
 from typing import Callable, Optional
 
 import requests
+from pydantic import BaseModel
 
 from ....utils import (
     APIClient,
@@ -12,6 +14,7 @@ from ....utils import (
     build_url,
     fetch_all_pages,
     handle_response,
+    retry,
 )
 from ..assets import SigmaAsset
 from .credentials import SigmaCredentials
@@ -29,7 +32,7 @@ _DATA_ELEMENTS: tuple[str, ...] = (
 )
 
 _AUTH_TIMEOUT_S = 60
-_SIGMA_TIMEOUT = 120
+_SIGMA_TIMEOUT_S = 300
 
 _SIGMA_HEADERS = {
     "Content-Type": _CONTENT_TYPE,
@@ -47,6 +50,23 @@ SIGMA_SAFE_MODE = RequestSafeMode(
     max_errors=_VOLUME_IGNORED,
     status_codes=_IGNORED_ERROR_CODES,
 )
+_THREADS_LINEAGE = 10  # empirically found; hit the rate limit with 20 workers
+_RETRY_NUMBER = 1
+_RETRY_BASE_MS = 60_000
+
+
+class LineageContext(BaseModel):
+    """all info needed to build the endpoint for lineage retrieval"""
+
+    workbook_id: str
+    element_id: str
+
+
+class Lineage(BaseModel):
+    """holds response from lineage API and context used to retrieve it"""
+
+    lineage: dict
+    context: LineageContext
 
 
 class SigmaBearerAuth(BearerAuth):
@@ -77,7 +97,7 @@ class SigmaClient(APIClient):
             host=credentials.host,
             auth=auth,
            headers=_SIGMA_HEADERS,
-            timeout=_SIGMA_TIMEOUT,
+            timeout=_SIGMA_TIMEOUT_S,
             safe_mode=safe_mode or SIGMA_SAFE_MODE,
         )
 
@@ -133,17 +153,51 @@ class SigmaClient(APIClient):
             page=page, workbook_id=workbook_id
         )
 
-    def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+    @retry(
+        (ConnectionError,),
+        max_retries=_RETRY_NUMBER,
+        base_ms=_RETRY_BASE_MS,
+        log_exc_info=True,
+    )
+    def _get_lineage(self, lineage_context: LineageContext) -> Lineage:
+        """
+        return the lineage from API and other ids needed to characterize
+        lineage in castor
+        """
+        workbook_id = lineage_context.workbook_id
+        element_id = lineage_context.element_id
+        endpoint = SigmaEndpointFactory.lineage(workbook_id, element_id)
+        return Lineage(lineage=self._get(endpoint), context=lineage_context)
+
+    @staticmethod
+    def _lineage_context(elements: list[dict]) -> list[LineageContext]:
+        """
+        Helper function to prepare context for lineage retrieval.
+        Elements without associated columns are skipped.
+        """
+        contexts: list[LineageContext] = []
        for element in elements:
-            workbook_id = element["workbook_id"]
-            element_id = element["elementId"]
-            lineage = self._get(
-                endpoint=SigmaEndpointFactory.lineage(workbook_id, element_id)
+            if element.get("columns") is None:
+                continue
+
+            context = LineageContext(
+                workbook_id=element["workbook_id"],
+                element_id=element["elementId"],
             )
+            contexts.append(context)
+        return contexts
+
+    def _get_all_lineages(self, elements: list[dict]) -> Iterator[dict]:
+        lineage_context = self._lineage_context(elements)
+
+        with ThreadPoolExecutor(max_workers=_THREADS_LINEAGE) as executor:
+            results = executor.map(self._get_lineage, lineage_context)
+
+        for lineage in results:
            yield {
-                **lineage,
-                "workbook_id": workbook_id,
-                "element_id": element_id,
+                **lineage.lineage,
+                "workbook_id": lineage.context.workbook_id,
+                "element_id": lineage.context.element_id,
            }
 
     def _get_all_queries(self, workbooks: list[dict]) -> Iterator[dict]:
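The Sigma change fans lineage calls out over a bounded thread pool (10 workers, per the rate-limit comment) and retries once on ConnectionError after a 60-second delay. The castor retry decorator's internals are not shown in this diff; a standard-library-only sketch of the same pattern:

import time
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, Iterable, Iterator, TypeVar

T = TypeVar("T")
R = TypeVar("R")

def with_retry(fn: Callable[[T], R], base_s: float = 60.0, max_retries: int = 1) -> Callable[[T], R]:
    # illustrative stand-in for the `retry` decorator imported from ....utils
    def wrapper(item: T) -> R:
        for attempt in range(max_retries + 1):
            try:
                return fn(item)
            except ConnectionError:
                if attempt == max_retries:
                    raise
                time.sleep(base_s)  # back off before the next attempt
        raise AssertionError("unreachable")
    return wrapper

def fan_out(fn: Callable[[T], R], items: Iterable[T], workers: int = 10) -> Iterator[R]:
    # bounded concurrency keeps the API below its rate limit
    with ThreadPoolExecutor(max_workers=workers) as executor:
        yield from executor.map(with_retry(fn), items)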
castor_extractor/visualization/thoughtspot/assets.py CHANGED
@@ -4,6 +4,8 @@ from ...types import ExternalAsset
 class ThoughtspotAsset(ExternalAsset):
     """Thoughtspot assets"""
 
+    ANSWERS = "answers"
+    ANSWER_USAGES = "answer_usages"
     LIVEBOARDS = "liveboards"
+    LIVEBOARD_USAGES = "liveboard_usages"
     LOGICAL_TABLES = "logical_tables"
-    USAGES = "usages"
castor_extractor/visualization/thoughtspot/client/client.py CHANGED
@@ -30,7 +30,12 @@ _THOUGHTSPOT_HEADERS = {
     "Content-Type": "application/json",
 }
 _METADATA_BATCH_SIZE = 100
-_USAGE_LIVEBOARD_ID = "bea79810-145f-4ad0-a02c-4177a6e7d861"
+# https://docs.thoughtspot.com/cloud/latest/object-usage-liveboard
+_OBJECT_USAGE_LIVEBOARD = "Object Usage"
+_ANSWER_USAGE_VIZ = "Answer Usage, by User"
+# https://docs.thoughtspot.com/cloud/latest/user-adoption
+_USER_ADOPTION_LIVEBOARD = "User Adoption"
+_LIVEBOARD_USAGE_VIZ = "Popular Liveboards Last 30 Days"
 # By default, no errors are ignored for the moment
 THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
 
@@ -69,23 +74,39 @@ class ThoughtspotClient(APIClient):
     def _metadata_search(
         self,
         metadata_type: str,
+        identifier: Optional[str] = None,
     ) -> Iterator[dict]:
+        """
+        Yields assets of the given asset type, and optionally filters on a
+        specific identifier.
+        """
         offset = 0
+
         while True:
+            search_filters = {
+                "metadata": [{"type": metadata_type}],
+                "include_details": True,
+                "record_size": _METADATA_BATCH_SIZE,
+                "record_offset": offset,
+            }
+            if identifier:
+                search_filters["metadata"] = {
+                    "identifier": identifier,
+                    "type": metadata_type,
+                }
+
             metadata = self._post(
                 ThoughtspotEndpointFactory.metadata_search(),
-                data={
-                    "metadata": [{"type": metadata_type}],
-                    "include_details": True,
-                    "record_size": _METADATA_BATCH_SIZE,
-                    "record_offset": offset,
-                },
+                data=search_filters,
             )
             yield from metadata
             if len(metadata) < _METADATA_BATCH_SIZE:
                 break
             offset = offset + _METADATA_BATCH_SIZE
 
+    def _get_all_answers(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="ANSWER")
+
     def _get_all_liveboards(self) -> Iterator[dict]:
         yield from self._metadata_search(metadata_type="LIVEBOARD")
 
@@ -95,26 +116,58 @@ class ThoughtspotClient(APIClient):
     def _get_all_tables(self) -> Iterator[dict]:
         yield from self._metadata_search(metadata_type="LOGICAL_TABLE")
 
-    def _get_liveboards_usages(self) -> Iterator[dict]:
+    def _get_usages(
+        self,
+        liveboard_name: str,
+        visualization_name: str,
+    ) -> Iterator[dict]:
+        """
+        Yields the data of a given visualization in the given liveboard.
+        ThoughtSpot maintains two system liveboards with stats about data usage,
+        which are useful to compute view counts and popularity.
+        """
+        usage_liveboard = next(
+            self._metadata_search(
+                metadata_type="LIVEBOARD", identifier=liveboard_name
+            )
+        )
+        liveboard_id = usage_liveboard["metadata_id"]
+
         data = self._post(
             endpoint=ThoughtspotEndpointFactory.liveboard(),
             headers={"Accept": "application/octet-stream"},
             data={
-                "metadata_identifier": _USAGE_LIVEBOARD_ID,
+                "metadata_identifier": liveboard_id,
                 "file_format": "CSV",
-                "visualization_identifiers": [
-                    "Popular Liveboards Last 30 Days"
-                ],
+                "visualization_identifiers": [visualization_name],
            },
             handler=lambda x: x.text,
         )
         yield from usage_liveboard_reader(data)
 
-    def fetch(self, asset: ThoughtspotAsset):
+    def _get_answer_usages(self) -> Iterator[dict]:
+        return self._get_usages(
+            liveboard_name=_OBJECT_USAGE_LIVEBOARD,
+            visualization_name=_ANSWER_USAGE_VIZ,
+        )
+
+    def _get_liveboards_usages(self) -> Iterator[dict]:
+        return self._get_usages(
+            liveboard_name=_USER_ADOPTION_LIVEBOARD,
+            visualization_name=_LIVEBOARD_USAGE_VIZ,
+        )
+
+    def fetch(self, asset: ThoughtspotAsset) -> Iterator[dict]:
+        if asset == ThoughtspotAsset.ANSWERS:
+            yield from self._get_all_answers()
+
+        if asset == ThoughtspotAsset.ANSWER_USAGES:
+            yield from self._get_answer_usages()
+
         if asset == ThoughtspotAsset.LIVEBOARDS:
             yield from self._get_all_liveboards()
 
-        if asset == ThoughtspotAsset.USAGES:
+        if asset == ThoughtspotAsset.LIVEBOARD_USAGES:
            yield from self._get_liveboards_usages()
 
         if asset == ThoughtspotAsset.LOGICAL_TABLES:
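With this change, usage extraction no longer hard-codes a liveboard GUID: _get_usages looks the system liveboard up by name through _metadata_search, then exports one of its visualizations as CSV. An illustrative sketch of consuming the new asset types (the import path is assumed from the package layout; `client` stands for an already configured ThoughtspotClient, whose construction is outside this diff):

from typing import Iterator

from castor_extractor.visualization.thoughtspot import ThoughtspotAsset  # path assumed

def extract_usage_rows(client) -> Iterator[dict]:
    # `client` is assumed to be a configured ThoughtspotClient
    yield from client.fetch(ThoughtspotAsset.ANSWER_USAGES)     # "Answer Usage, by User" rows
    yield from client.fetch(ThoughtspotAsset.LIVEBOARD_USAGES)  # "Popular Liveboards Last 30 Days" rows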