castor-extractor 0.20.0__py3-none-any.whl → 0.20.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of castor-extractor has been flagged as a potentially problematic release.
Files changed (43)
  1. CHANGELOG.md +20 -0
  2. castor_extractor/commands/extract_redshift.py +6 -0
  3. castor_extractor/commands/extract_thoughtspot.py +18 -0
  4. castor_extractor/utils/client/api/client.py +7 -2
  5. castor_extractor/utils/client/api/safe_request.py +6 -3
  6. castor_extractor/visualization/looker/api/constants.py +0 -4
  7. castor_extractor/visualization/powerbi/__init__.py +1 -1
  8. castor_extractor/visualization/powerbi/assets.py +7 -1
  9. castor_extractor/visualization/powerbi/client/__init__.py +2 -3
  10. castor_extractor/visualization/powerbi/client/authentication.py +27 -0
  11. castor_extractor/visualization/powerbi/client/client.py +207 -0
  12. castor_extractor/visualization/powerbi/client/client_test.py +173 -0
  13. castor_extractor/visualization/powerbi/client/constants.py +0 -67
  14. castor_extractor/visualization/powerbi/client/credentials.py +3 -4
  15. castor_extractor/visualization/powerbi/client/credentials_test.py +3 -4
  16. castor_extractor/visualization/powerbi/client/endpoints.py +65 -0
  17. castor_extractor/visualization/powerbi/client/pagination.py +32 -0
  18. castor_extractor/visualization/powerbi/extract.py +14 -9
  19. castor_extractor/visualization/thoughtspot/__init__.py +3 -0
  20. castor_extractor/visualization/thoughtspot/assets.py +9 -0
  21. castor_extractor/visualization/thoughtspot/client/__init__.py +2 -0
  22. castor_extractor/visualization/thoughtspot/client/client.py +120 -0
  23. castor_extractor/visualization/thoughtspot/client/credentials.py +18 -0
  24. castor_extractor/visualization/thoughtspot/client/endpoints.py +12 -0
  25. castor_extractor/visualization/thoughtspot/client/utils.py +25 -0
  26. castor_extractor/visualization/thoughtspot/client/utils_test.py +57 -0
  27. castor_extractor/visualization/thoughtspot/extract.py +49 -0
  28. castor_extractor/warehouse/redshift/extract.py +10 -1
  29. castor_extractor/warehouse/redshift/extract_test.py +26 -0
  30. castor_extractor/warehouse/redshift/queries/query_serverless.sql +69 -0
  31. castor_extractor/warehouse/redshift/query.py +12 -1
  32. castor_extractor/warehouse/salesforce/client.py +1 -1
  33. castor_extractor/warehouse/salesforce/format.py +40 -30
  34. castor_extractor/warehouse/salesforce/format_test.py +61 -24
  35. {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/METADATA +21 -1
  36. {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/RECORD +39 -26
  37. {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/entry_points.txt +1 -0
  38. castor_extractor/visualization/powerbi/client/rest.py +0 -305
  39. castor_extractor/visualization/powerbi/client/rest_test.py +0 -290
  40. castor_extractor/visualization/powerbi/client/utils.py +0 -19
  41. castor_extractor/visualization/powerbi/client/utils_test.py +0 -24
  42. {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/LICENCE +0 -0
  43. {castor_extractor-0.20.0.dist-info → castor_extractor-0.20.5.dist-info}/WHEEL +0 -0
castor_extractor/visualization/powerbi/client/credentials.py
@@ -3,8 +3,7 @@ from typing import List, Optional
 from pydantic import Field, field_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 
-from .constants import Urls
-
+DEFAULT_SCOPE = "https://analysis.windows.net/powerbi/api/.default"
 POWERBI_ENV_PREFIX = "CASTOR_POWERBI_"
 
 
@@ -20,9 +19,9 @@ class PowerbiCredentials(BaseSettings):
     client_id: str
     tenant_id: str
     secret: str = Field(repr=False)
-    scopes: List[str] = [Urls.DEFAULT_SCOPE]
+    scopes: List[str] = [DEFAULT_SCOPE]
 
     @field_validator("scopes", mode="before")
     @classmethod
     def _check_scopes(cls, scopes: Optional[List[str]]) -> List[str]:
-        return scopes if scopes is not None else [Urls.DEFAULT_SCOPE]
+        return scopes if scopes is not None else [DEFAULT_SCOPE]
castor_extractor/visualization/powerbi/client/credentials_test.py
@@ -1,5 +1,4 @@
-from .constants import Urls
-from .credentials import PowerbiCredentials
+from .credentials import DEFAULT_SCOPE, PowerbiCredentials
 
 
 def test_credentials():
@@ -13,7 +12,7 @@ def test_credentials():
         client_id=client_id,
         secret=secret,
     )
-    assert credentials.scopes == [Urls.DEFAULT_SCOPE]
+    assert credentials.scopes == [DEFAULT_SCOPE]
 
     credentials = PowerbiCredentials(
         tenant_id=tenant_id,
@@ -21,7 +20,7 @@ def test_credentials():
         secret=secret,
         scopes=None,
     )
-    assert credentials.scopes == [Urls.DEFAULT_SCOPE]
+    assert credentials.scopes == [DEFAULT_SCOPE]
 
     # empty scopes
     credentials = PowerbiCredentials(
castor_extractor/visualization/powerbi/client/endpoints.py
@@ -0,0 +1,65 @@
+from datetime import date, datetime
+from typing import Optional, Tuple
+
+from ....utils import at_midnight, format_date, yesterday
+
+_CLIENT_APP_BASE = "https://login.microsoftonline.com"
+_REST_API_BASE_PATH = "https://api.powerbi.com/v1.0/myorg"
+
+
+def _time_filter(day: Optional[date]) -> Tuple[datetime, datetime]:
+    target_day = day or yesterday()
+    start = at_midnight(target_day)
+    end = datetime.combine(target_day, datetime.max.time())
+    return start, end
+
+
+class PowerBiEndpointFactory:
+    @classmethod
+    def activity_events(cls, day: Optional[date]) -> str:
+        start, end = _time_filter(day)
+        url = f"{_REST_API_BASE_PATH}/admin/activityevents"
+        url += "?$filter=Activity eq 'viewreport'"
+        url += f"&startDateTime='{format_date(start)}'"
+        url += f"&endDateTime='{format_date(end)}'"
+        return url
+
+    @classmethod
+    def authority(cls, tenant_id: str) -> str:
+        return f"{_CLIENT_APP_BASE}/{tenant_id}"
+
+    @classmethod
+    def dashboards(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/dashboards"
+
+    @classmethod
+    def datasets(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/datasets"
+
+    @classmethod
+    def groups(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/groups"
+
+    @classmethod
+    def metadata_create_scan(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/workspaces/getInfo"
+
+    @classmethod
+    def metadata_scan_result(cls, scan_id: int) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/workspaces/scanResult/{scan_id}"
+
+    @classmethod
+    def metadata_scan_status(cls, scan_id: int) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/workspaces/scanStatus/{scan_id}"
+
+    @classmethod
+    def pages(cls, report_id: str) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/reports/{report_id}/pages"
+
+    @classmethod
+    def reports(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/reports"
+
+    @classmethod
+    def workspace_ids(cls) -> str:
+        return f"{_REST_API_BASE_PATH}/admin/workspaces/modified"
castor_extractor/visualization/powerbi/client/pagination.py
@@ -0,0 +1,32 @@
+from typing import Optional
+
+from pydantic import ConfigDict
+from pydantic.alias_generators import to_camel
+
+from ....utils import (
+    FetchNextPageBy,
+    PaginationModel,
+)
+
+
+class PowerBiPagination(PaginationModel):
+    model_config = ConfigDict(
+        alias_generator=to_camel,
+        populate_by_name=True,
+        from_attributes=True,
+    )
+
+    fetch_by: FetchNextPageBy = FetchNextPageBy.URL
+
+    activity_event_entities: list
+    continuation_uri: Optional[str] = None
+    last_result_set: bool = False
+
+    def is_last(self) -> bool:
+        return self.last_result_set
+
+    def next_page_payload(self) -> Optional[str]:
+        return self.continuation_uri
+
+    def page_results(self) -> list:
+        return self.activity_event_entities
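PowerBiPagination leans on pydantic's to_camel alias generator so that raw activity-events pages (activityEventEntities, continuationUri, lastResultSet) map onto snake_case fields. A standalone sketch of that aliasing, with a plain BaseModel standing in for the package's PaginationModel so no assumption about its base class is needed:

    # Standalone illustration of the camelCase aliasing used above; a plain
    # BaseModel stands in for the package's PaginationModel.
    from typing import Optional

    from pydantic import BaseModel, ConfigDict
    from pydantic.alias_generators import to_camel


    class PageSketch(BaseModel):
        model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

        activity_event_entities: list
        continuation_uri: Optional[str] = None
        last_result_set: bool = False


    page = PageSketch.model_validate(
        {
            "activityEventEntities": [{"Activity": "viewreport"}],
            "continuationUri": "https://api.powerbi.com/next-page",  # placeholder
            "lastResultSet": False,
        }
    )
    assert page.continuation_uri is not None  # URL of the next page to follow
    assert not page.last_result_set  # keep paginating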
castor_extractor/visualization/powerbi/extract.py
@@ -1,4 +1,5 @@
-from typing import Iterable, List, Tuple, Union
+import logging
+from typing import Iterable, Tuple, Union
 
 from ...utils import (
     OUTPUT_DIR,
@@ -9,19 +10,23 @@ from ...utils import (
     write_json,
     write_summary,
 )
-from .assets import METADATA_ASSETS, PowerBiAsset
-from .client import Client, PowerbiCredentials
+from .assets import METADATA_ASSETS, REPORTS_ASSETS, PowerBiAsset
+from .client import PowerbiClient, PowerbiCredentials
+
+logger = logging.getLogger(__name__)
 
 
 def iterate_all_data(
-    client: Client,
-) -> Iterable[Tuple[PowerBiAsset, Union[List, dict]]]:
+    client: PowerbiClient,
+) -> Iterable[Tuple[PowerBiAsset, Union[list, dict]]]:
     for asset in PowerBiAsset:
-        if asset in METADATA_ASSETS:
+        if asset in METADATA_ASSETS + REPORTS_ASSETS:
            continue
 
-        data = client.fetch(asset)
-        yield asset, deep_serialize(data)
+        logger.info(f"Extracting {asset.name} from API")
+        data = list(deep_serialize(client.fetch(asset)))
+        yield asset, data
+        logger.info(f"Extracted {len(data)} {asset.name} from API")
 
 
 def extract_all(**kwargs) -> None:
@@ -31,7 +36,7 @@ def extract_all(**kwargs) -> None:
     """
     _output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)
     creds = PowerbiCredentials(**kwargs)
-    client = Client(creds)
+    client = PowerbiClient(creds)
     ts = current_timestamp()
 
     for key, data in iterate_all_data(client):
castor_extractor/visualization/thoughtspot/__init__.py
@@ -0,0 +1,3 @@
+from .assets import ThoughtspotAsset
+from .client import ThoughtspotClient, ThoughtspotCredentials
+from .extract import extract_all, iterate_all_data
castor_extractor/visualization/thoughtspot/assets.py
@@ -0,0 +1,9 @@
+from ...types import ExternalAsset
+
+
+class ThoughtspotAsset(ExternalAsset):
+    """Thoughtspot assets"""
+
+    LIVEBOARDS = "liveboards"
+    LOGICAL_TABLES = "logical_tables"
+    USAGES = "usages"
castor_extractor/visualization/thoughtspot/client/__init__.py
@@ -0,0 +1,2 @@
+from .client import ThoughtspotClient
+from .credentials import ThoughtspotCredentials
castor_extractor/visualization/thoughtspot/client/client.py
@@ -0,0 +1,120 @@
+from typing import Dict, Iterator, Optional
+
+import requests
+
+from ....utils import (
+    APIClient,
+    BearerAuth,
+    RequestSafeMode,
+    build_url,
+    handle_response,
+)
+from ..assets import (
+    ThoughtspotAsset,
+)
+from .credentials import (
+    ThoughtspotCredentials,
+)
+from .endpoints import (
+    ThoughtspotEndpointFactory,
+)
+from .utils import (
+    usage_liveboard_reader,
+)
+
+_AUTH_TIMEOUT_S = 60
+_THOUGHTSPOT_HEADERS = {
+    "X-Requested-By": "ThoughtSpot",
+    "Accept": "application/json",
+    "Content-Type": "application/json",
+}
+_METADATA_BATCH_SIZE = 100
+_USAGE_LIVEBOARD_ID = "bea79810-145f-4ad0-a02c-4177a6e7d861"
+# By default, no errors are ignored for the moment
+THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
+
+
+class ThoughtspotBearerAuth(BearerAuth):
+    def __init__(self, host: str, token_payload: Dict[str, str]):
+        auth_endpoint = ThoughtspotEndpointFactory.authentication()
+        self.authentication_url = build_url(host, auth_endpoint)
+        self.token_payload = token_payload
+
+    def fetch_token(self):
+        token_api_path = self.authentication_url
+        token_response = requests.post(
+            token_api_path, data=self.token_payload, timeout=_AUTH_TIMEOUT_S
+        )
+        return handle_response(token_response)["token"]
+
+
+class ThoughtspotClient(APIClient):
+    def __init__(
+        self,
+        credentials: ThoughtspotCredentials,
+        safe_mode: Optional[RequestSafeMode] = None,
+    ):
+        auth = ThoughtspotBearerAuth(
+            host=credentials.base_url,
+            token_payload=credentials.dict(),
+        )
+        super().__init__(
+            host=credentials.base_url,
+            auth=auth,
+            headers=_THOUGHTSPOT_HEADERS,
+            safe_mode=safe_mode or THOUGHTSPOT_SAFE_MODE,
+        )
+
+    def _metadata_search(
+        self,
+        metadata_type: str,
+    ) -> Iterator[dict]:
+        offset = 0
+        while True:
+            metadata = self._post(
+                ThoughtspotEndpointFactory.metadata_search(),
+                data={
+                    "metadata": [{"type": metadata_type}],
+                    "include_details": True,
+                    "record_size": _METADATA_BATCH_SIZE,
+                    "record_offset": offset,
+                },
+            )
+            yield from metadata
+            if len(metadata) < _METADATA_BATCH_SIZE:
+                break
+            offset = offset + _METADATA_BATCH_SIZE
+
+    def _get_all_liveboards(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="LIVEBOARD")
+
+    def _get_all_columns(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="LOGICAL_COLUMN")
+
+    def _get_all_tables(self) -> Iterator[dict]:
+        yield from self._metadata_search(metadata_type="LOGICAL_TABLE")
+
+    def _get_liveboards_usages(self) -> Iterator[dict]:
+        data = self._post(
+            endpoint=ThoughtspotEndpointFactory.liveboard(),
+            headers={"Accept": "application/octet-stream"},
+            data={
+                "metadata_identifier": _USAGE_LIVEBOARD_ID,
+                "file_format": "CSV",
+                "visualization_identifiers": [
+                    "Popular Liveboards Last 30 Days"
+                ],
+            },
+            handler=lambda x: x.text,
+        )
+        yield from usage_liveboard_reader(data)
+
+    def fetch(self, asset: ThoughtspotAsset):
+        if asset == ThoughtspotAsset.LIVEBOARDS:
+            yield from self._get_all_liveboards()
+
+        if asset == ThoughtspotAsset.USAGES:
+            yield from self._get_liveboards_usages()
+
+        if asset == ThoughtspotAsset.LOGICAL_TABLES:
+            yield from self._get_all_tables()
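A minimal usage sketch for the new client, assuming the package is installed and a reachable ThoughtSpot host; credential values are placeholders:

    # Hypothetical wiring of the new ThoughtSpot client (placeholder values).
    from castor_extractor.visualization.thoughtspot import (
        ThoughtspotAsset,
        ThoughtspotClient,
        ThoughtspotCredentials,
    )

    credentials = ThoughtspotCredentials(
        base_url="https://mycompany.thoughtspot.cloud",
        username="extractor",
        password="change-me",  # excluded from repr by Field(repr=False)
    )
    client = ThoughtspotClient(credentials=credentials)

    # fetch() yields dicts: liveboards and logical tables come from
    # metadata/search in batches of 100, usages from the CSV export of the
    # dedicated usage liveboard.
    for liveboard in client.fetch(ThoughtspotAsset.LIVEBOARDS):
        print(liveboard)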
castor_extractor/visualization/thoughtspot/client/credentials.py
@@ -0,0 +1,18 @@
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+THOUGHTSPOT_ENV_PREFIX = "CASTOR_THOUGHTSPOT_"
+
+
+class ThoughtspotCredentials(BaseSettings):
+    """Class to handle Thoughtspot rest API permissions"""
+
+    model_config = SettingsConfigDict(
+        env_prefix=THOUGHTSPOT_ENV_PREFIX,
+        extra="ignore",
+        populate_by_name=True,
+    )
+
+    username: str
+    password: str = Field(repr=False)
+    base_url: str
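Because the settings class declares env_prefix=THOUGHTSPOT_ENV_PREFIX, the same fields can be resolved from CASTOR_THOUGHTSPOT_* environment variables instead of keyword arguments; a small sketch with placeholder values:

    # Resolving ThoughtspotCredentials from the environment (placeholder values).
    import os

    from castor_extractor.visualization.thoughtspot import ThoughtspotCredentials

    os.environ["CASTOR_THOUGHTSPOT_BASE_URL"] = "https://mycompany.thoughtspot.cloud"
    os.environ["CASTOR_THOUGHTSPOT_USERNAME"] = "extractor"
    os.environ["CASTOR_THOUGHTSPOT_PASSWORD"] = "change-me"

    credentials = ThoughtspotCredentials()  # fields filled from CASTOR_THOUGHTSPOT_*
    assert credentials.username == "extractor"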
castor_extractor/visualization/thoughtspot/client/endpoints.py
@@ -0,0 +1,12 @@
+class ThoughtspotEndpointFactory:
+    @classmethod
+    def authentication(cls) -> str:
+        return "api/rest/2.0/auth/token/full"
+
+    @classmethod
+    def metadata_search(cls) -> str:
+        return "api/rest/2.0/metadata/search"
+
+    @classmethod
+    def liveboard(cls) -> str:
+        return "api/rest/2.0/report/liveboard"
castor_extractor/visualization/thoughtspot/client/utils.py
@@ -0,0 +1,25 @@
+import csv
+from io import StringIO
+from typing import Iterator
+
+
+def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
+    """
+    Converts a CSV string into an iterator of dictionaries after
+    ignoring the first 6 lines, using the 7th line as the header.
+    First 6 lines looks like the following:
+
+    "Data extract produced by Castor on 09/19/2024 06:54"
+    "Filters applied on data :"
+    "User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
+    "Pinboard NOT IN [mlm - availability pinboard,null]"
+    "Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
+    "Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+
+    """
+    csv_file = StringIO(usage_liveboard_csv)
+
+    for _ in range(7):
+        next(csv_file)
+
+    yield from csv.DictReader(csv_file)
castor_extractor/visualization/thoughtspot/client/utils_test.py
@@ -0,0 +1,57 @@
+from .utils import (
+    usage_liveboard_reader,
+)
+
+VALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
+"Filters applied on data :"
+"User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
+"Pinboard NOT IN [mlm - availability pinboard,null]"
+"Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
+"Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+""
+"Pinboard","Pinboard Views","Unique Number of User"
+"Market Report","559","19"
+"Retailer report","204","14"
+"Second-hand market","72","6"
+"September test","25","2"'''
+
+
+# Invalid CSV input (missing data rows)
+INVALID_CSV = '''"Data extract produced by Castor on 09/19/2024 06:54"
+"Filters applied on data :"
+"User Action IN [pinboard_embed_view,pinboard_tspublic_no_runtime_filter,pinboard_tspublic_runtime_filter,pinboard_view]"
+"Pinboard NOT IN [mlm - availability pinboard,null]"
+"Timestamp >= 20240820 00:00:00 < 20240919 00:00:00"
+"Timestamp >= 20240919 00:00:00 < 20240920 00:00:00"
+""'''
+
+
+def test_usage_liveboard_reader():
+    expected_output = [
+        {
+            "Pinboard": "Market Report",
+            "Pinboard Views": "559",
+            "Unique Number of User": "19",
+        },
+        {
+            "Pinboard": "Retailer report",
+            "Pinboard Views": "204",
+            "Unique Number of User": "14",
+        },
+        {
+            "Pinboard": "Second-hand market",
+            "Pinboard Views": "72",
+            "Unique Number of User": "6",
+        },
+        {
+            "Pinboard": "September test",
+            "Pinboard Views": "25",
+            "Unique Number of User": "2",
+        },
+    ]
+
+    result = list(usage_liveboard_reader(VALID_CSV))
+    assert result == expected_output
+
+    result = list(usage_liveboard_reader(INVALID_CSV))
+    assert result == []  # Expect an empty result since there is no data
castor_extractor/visualization/thoughtspot/extract.py
@@ -0,0 +1,49 @@
+import logging
+from typing import Iterable, Iterator, Tuple, Union
+
+from ...utils import (
+    OUTPUT_DIR,
+    current_timestamp,
+    deep_serialize,
+    from_env,
+    get_output_filename,
+    write_json,
+    write_summary,
+)
+from .assets import ThoughtspotAsset
+from .client import (
+    ThoughtspotClient,
+    ThoughtspotCredentials,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def iterate_all_data(
+    client: ThoughtspotClient,
+) -> Iterable[Tuple[ThoughtspotAsset, Union[list, Iterator, dict]]]:
+    """Iterate over the extracted data from Thoughtspot"""
+
+    for asset in ThoughtspotAsset:
+        logger.info(f"Extracting {asset.value} from API")
+        data = client.fetch(asset)
+        yield asset, deep_serialize(data)
+
+
+def extract_all(**kwargs) -> None:
+    """
+    Extract data from Thoughtspot API
+    Store the output files locally under the given output_directory
+    """
+    _output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)
+
+    credentials = ThoughtspotCredentials(**kwargs)
+    client = ThoughtspotClient(credentials=credentials)
+
+    ts = current_timestamp()
+
+    for key, data in iterate_all_data(client):
+        filename = get_output_filename(key.name.lower(), _output_directory, ts)
+        write_json(filename, list(data))
+
+    write_summary(_output_directory, ts, host=credentials.base_url)
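The extract_all entry point above can also be driven directly from Python; a one-call sketch where the keyword arguments feed ThoughtspotCredentials and the output directory (all values are placeholders):

    # One-call extraction sketch; kwargs feed ThoughtspotCredentials and the
    # output directory (placeholders below; the CASTOR_* env vars work as well).
    from castor_extractor.visualization.thoughtspot import extract_all

    extract_all(
        base_url="https://mycompany.thoughtspot.cloud",
        username="extractor",
        password="change-me",
        output="/tmp/castor_thoughtspot",
    )
    # Writes one timestamped JSON file per ThoughtspotAsset plus a summary
    # file into the output directory.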
castor_extractor/warehouse/redshift/extract.py
@@ -34,6 +34,7 @@ REDSHIFT_PASSWORD = "CASTOR_REDSHIFT_PASSWORD" # noqa: S105
 REDSHIFT_HOST = "CASTOR_REDSHIFT_HOST"
 REDSHIFT_PORT = "CASTOR_REDSHIFT_PORT"
 REDSHIFT_DATABASE = "CASTOR_REDSHIFT_DATABASE"
+REDSHIFT_SERVERLESS = "CASTOR_REDSHIFT_SERVERLESS"
 
 
 def _credentials(params: dict) -> dict:
@@ -48,6 +49,14 @@ def _credentials(params: dict) -> dict:
     }
 
 
+def _query_builder(params: dict) -> RedshiftQueryBuilder:
+    env_parameter = from_env(REDSHIFT_SERVERLESS, allow_missing=True)
+    from_env_ = str(env_parameter).lower() == "true"
+    from_params_ = params.get("serverless", False)
+    is_serverless = from_params_ or from_env_
+    return RedshiftQueryBuilder(is_serverless=is_serverless)
+
+
 def extract_all(**kwargs) -> None:
     """
     Extract all assets from Redshift and store the results in CSV files
@@ -56,7 +65,7 @@ def extract_all(**kwargs) -> None:
 
     client = RedshiftClient(credentials=_credentials(kwargs))
 
-    query_builder = RedshiftQueryBuilder()
+    query_builder = _query_builder(kwargs)
 
     storage = LocalStorage(directory=output_directory)
 
castor_extractor/warehouse/redshift/extract_test.py
@@ -0,0 +1,26 @@
+import pytest
+
+from .extract import (
+    REDSHIFT_SERVERLESS,
+    _query_builder,
+)
+
+
+@pytest.mark.parametrize(
+    "serverless_param,env_param,expected",
+    [
+        (True, "False", True),
+        (False, "True", True),
+        (None, "TRUE", True),
+        (None, "TrUe", True),
+        (None, "FAlSE", False),
+        (None, "False", False),
+        (None, None, False),
+        (True, None, True),
+    ],
+)
+def test__query_builder(serverless_param, env_param, expected, monkeypatch):
+    params = {"serverless": serverless_param}
+    monkeypatch.setenv(REDSHIFT_SERVERLESS, env_param)
+
+    assert _query_builder(params).is_serverless == expected
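Taken together with _query_builder above, serverless extraction is switched on either by the serverless keyword forwarded through extract_all(**kwargs) or by the CASTOR_REDSHIFT_SERVERLESS environment variable; a short sketch:

    # Two ways to obtain a serverless-aware RedshiftQueryBuilder (sketch).
    import os

    from castor_extractor.warehouse.redshift.extract import _query_builder

    # 1) explicit keyword, as forwarded by extract_all(**kwargs)
    assert _query_builder({"serverless": True}).is_serverless

    # 2) environment toggle, read via from_env(REDSHIFT_SERVERLESS, allow_missing=True)
    os.environ["CASTOR_REDSHIFT_SERVERLESS"] = "true"
    assert _query_builder({}).is_serverless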
castor_extractor/warehouse/redshift/queries/query_serverless.sql
@@ -0,0 +1,69 @@
+WITH parameters AS (
+    SELECT
+        :day AS day_start,
+        :hour_min AS hour_min,
+        :hour_max AS hour_max
+),
+
+queries_deduplicated AS (
+    SELECT DISTINCT q.query_id
+    FROM SYS_QUERY_HISTORY AS q
+    CROSS JOIN parameters AS p
+    WHERE TRUE
+        AND DATE(q.start_time) = p.day_start
+        AND EXTRACT('hour' FROM q.start_time) BETWEEN p.hour_min AND p.hour_max
+),
+
+query AS (
+    SELECT
+        q.query_id,
+        qt.text,
+        qt.sequence,
+        COUNT(*) OVER(PARTITION BY q.query_id) AS sequence_count
+    FROM queries_deduplicated AS q
+    INNER JOIN SYS_QUERY_TEXT AS qt ON q.query_id = qt.query_id
+),
+
+raw_query_text AS
+(
+    SELECT
+        q.query_id,
+        LISTAGG(q.text, '') WITHIN GROUP (ORDER BY q.sequence) AS agg_text
+    FROM query AS q
+    WHERE TRUE
+        -- LISTAGG raises an error when total length >= 65535
+        -- each query text contains 4000 char max
+        AND q.sequence_count < (65535 / 4000)
+    GROUP BY q.query_id
+),
+
+query_text AS (
+    SELECT
+        query_id,
+        CASE
+            WHEN agg_text ILIKE 'INSERT INTO%%'
+                THEN REGEXP_REPLACE(agg_text, 'VALUES (.*)', 'DEFAULT VALUES')
+            ELSE agg_text
+        END AS agg_text
+    FROM raw_query_text
+)
+SELECT
+    q.query_id::VARCHAR(256) AS query_id,
+    qt.agg_text::VARCHAR(60000) AS query_text,
+    q.database_name AS database_id,
+    q.database_name AS database_name,
+    q.session_id AS process_id,
+    0 as aborted,
+    q.start_time AS start_time,
+    q.end_time AS end_time,
+    q.user_id AS user_id,
+    q.query_label,
+    u.usename AS user_name
+FROM SYS_QUERY_HISTORY AS q
+JOIN query_text AS qt ON q.query_id = qt.query_id
+JOIN pg_catalog.pg_user AS u ON u.usesysid = q.user_id
+CROSS JOIN parameters AS p
+WHERE TRUE
+    AND DATE(q.start_time) = p.day_start
+    AND EXTRACT('hour' FROM q.start_time) BETWEEN p.hour_min AND p.hour_max
+    AND q.status = 'success'
castor_extractor/warehouse/redshift/query.py
@@ -15,10 +15,21 @@ class RedshiftQueryBuilder(AbstractQueryBuilder):
 
     def __init__(
         self,
+        is_serverless: bool = False,
        time_filter: Optional[TimeFilter] = None,
     ):
         super().__init__(time_filter=time_filter)
+        self.is_serverless = is_serverless
+
+    def build_query_serverless(self) -> ExtractionQuery:
+        """To get the query history in Redshift Serverless, we cannot use STL tables."""
+        statement = self._load_from_file("query_serverless.sql")
+        params = self._time_filter.to_dict()
+        return ExtractionQuery(statement, params)
 
     def build(self, asset: WarehouseAsset) -> List[ExtractionQuery]:
-        query = self.build_default(asset)
+        if asset == WarehouseAsset.QUERY and self.is_serverless:
+            query = self.build_query_serverless()
+        else:
+            query = self.build_default(asset)
         return [query]
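With the flag in place, build() routes WarehouseAsset.QUERY to query_serverless.sql and leaves every other asset on the default query; a sketch that reaches WarehouseAsset through the query module itself, since its defining package is not shown in this diff:

    # Sketch of the serverless routing; WarehouseAsset is reused from the query
    # module rather than guessing where it is defined.
    from castor_extractor.warehouse.redshift import query as redshift_query

    builder = redshift_query.RedshiftQueryBuilder(is_serverless=True)
    queries = builder.build(redshift_query.WarehouseAsset.QUERY)
    # -> a single ExtractionQuery backed by query_serverless.sql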
castor_extractor/warehouse/salesforce/client.py
@@ -89,4 +89,4 @@ class SalesforceClient(SalesforceBaseClient):
         ):
             fields = self.fetch_fields(api_name)
             sobject_fields[table_name] = fields
-        return self.formatter.columns(sobject_fields)
+        return list(self.formatter.columns(sobject_fields))