castor-extractor 0.24.21__py3-none-any.whl → 0.24.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic.
- CHANGELOG.md +16 -0
- castor_extractor/transformation/coalesce/client/client.py +12 -12
- castor_extractor/transformation/dbt/client.py +15 -7
- castor_extractor/transformation/dbt/client_test.py +5 -5
- castor_extractor/utils/__init__.py +1 -0
- castor_extractor/utils/client/api/client.py +19 -2
- castor_extractor/utils/retry.py +11 -2
- castor_extractor/utils/salesforce/client.py +25 -0
- castor_extractor/utils/url.py +48 -0
- castor_extractor/utils/url_test.py +55 -0
- castor_extractor/visualization/powerbi/client/client.py +23 -2
- castor_extractor/visualization/powerbi/client/client_test.py +1 -1
- {castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/METADATA +17 -1
- {castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/RECORD +17 -15
- {castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/LICENCE +0 -0
- {castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/WHEEL +0 -0
- {castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/entry_points.txt +0 -0
CHANGELOG.md
CHANGED
@@ -1,5 +1,21 @@
 # Changelog
 
+## 0.24.25 - 2025-06-12
+
+* DBT: Fix API base url
+
+## 0.24.24 - 2025-06-06
+
+* Power BI: handle rate limit issues when extracting pages
+
+## 0.24.23 - 2025-06-05
+
+* Salesforce: print response's error message when authentication fails
+
+## 0.24.22 - 2025-05-27
+
+* Add retry for `Request.Timeout` on **ApiClient**
+
 ## 0.24.21 - 2025-05-26
 
 * Looker Studio: add option to skip the extraction of view activity logs
castor_extractor/transformation/coalesce/client/client.py
CHANGED

@@ -1,14 +1,14 @@
+import logging
 from http import HTTPStatus
 from typing import Iterator, Optional
 
-import requests
+from requests import ConnectionError
 
 from ....utils import (
     APIClient,
     BearerAuth,
     RequestSafeMode,
     SerializedAsset,
-    retry,
 )
 from ..assets import CoalesceAsset, CoalesceQualityAsset
 from .credentials import CoalesceCredentials
@@ -20,9 +20,8 @@ from .utils import column_names_per_node, is_test, test_names_per_node
 
 _LIMIT_MAX = 1_000
 _MAX_ERRORS = 50
-
-
-_RETRY_EXCEPTIONS = [requests.exceptions.ConnectTimeout]
+
+logger = logging.getLogger(__name__)
 
 
 def _run_result_payload(result: dict, query_result: dict) -> dict:
@@ -75,11 +74,6 @@ class CoalesceClient(APIClient):
         result = self._get(endpoint=endpoint)
         return result["data"]
 
-    @retry(
-        exceptions=_RETRY_EXCEPTIONS,
-        max_retries=_RETRY_COUNT,
-        base_ms=_RETRY_BASE_MS,
-    )
     def _node_details(self, environment_id: int, node_id: str) -> dict:
         endpoint = CoalesceEndpointFactory.nodes(
             environment_id=environment_id, node_id=node_id
@@ -91,8 +85,14 @@ class CoalesceClient(APIClient):
         result = self._get(endpoint=endpoint)
         nodes: list[dict] = []
         for node in result["data"]:
-
-
+            try:
+                details = self._node_details(environment_id, node["id"])
+                nodes.append({**node, **details})
+            except ConnectionError as e:
+                node_id = node["id"]
+                message = f"ConnectionError, environment: {environment_id}, node: {node_id}"
+                logger.warning(message)
+                raise e
         return nodes
 
     def _fetch_all_nodes(self) -> SerializedAsset:
castor_extractor/transformation/dbt/client.py
CHANGED

@@ -8,12 +8,13 @@ from typing import Literal, Optional
 import requests
 from dateutil.parser import parse
 
+from ...utils.url import add_path
 from .credentials import DbtCredentials
 
 logger = logging.getLogger(__name__)
 
 
-_URL_SUFFIX = "/api/v2/accounts"
+_URL_SUFFIX = "/api/v2/accounts/"
 
 _DATA_KEY = "data"
 _SUCCESSFUL_RUN_STATUS = 10
@@ -52,7 +53,7 @@ class DbtClient:
         self._credentials = credentials
         self._account_url = _account_url(self._credentials.host)
         self._session = requests.Session()
-        self._account_id:
+        self._account_id: str = self._infer_account_id()
 
     def _headers(self, content_type: ContentType) -> dict:
         return {
@@ -88,16 +89,16 @@ class DbtClient:
             return result[_DATA_KEY]
         return result
 
-    def _infer_account_id(self) ->
+    def _infer_account_id(self) -> str:
         result = self._call(url=self._account_url)
-        return result[0]["id"]
+        return str(result[0]["id"])
 
     def list_job_identifiers(self) -> set[int]:
         """
         Return the IDs of all non-deleted jobs for this account
         https://docs.getdbt.com/dbt-cloud/api-v2-legacy#tag/Jobs/operation/listJobsForAccount
         """
-        url =
+        url = add_path(self._account_url, self._account_id, "jobs", "/")
         jobs = self._call(url)
         return {job["id"] for job in jobs if not _is_deleted(job)}
 
@@ -110,7 +111,7 @@ class DbtClient:
         Extract the last successful run id, optionally filtered on a given datetime range
         https://docs.getdbt.com/dbt-cloud/api-v2#tag/Runs/operation/listRunsForAccount
         """
-        url =
+        url = add_path(self._account_url, self._account_id, "runs", "/")
 
         params = {
             "job_definition_id": job_id or self._credentials.job_id,
@@ -142,7 +143,14 @@ class DbtClient:
         Fetch dbt manifest or run results
         https://docs.getdbt.com/dbt-cloud/api-v2-legacy#tag/Runs/operation/getArtifactsByRunId
         """
-        url =
+        url = add_path(
+            self._account_url,
+            self._account_id,
+            "runs",
+            str(run_id),
+            "artifacts",
+            artifact,
+        )
         logger.info(
             f"Extracting {artifact} from run id {run_id} with url {url}"
         )
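The fix above boils down to building dbt Cloud URLs with the new `add_path` helper instead of string concatenation. A minimal sketch of the resulting URLs, assuming the package is installed and `castor_extractor.utils.url` is importable as in the diff (the account id, run id, and artifact name below are illustrative, not package defaults):

```python
from castor_extractor.utils.url import add_path

# _account_url(host) now ends with the trailing slash from the new _URL_SUFFIX
account_url = "https://cloud.getdbt.com/api/v2/accounts/"
account_id = "40"  # _infer_account_id() now returns a string

# Mirrors list_job_identifiers: the final "/" part keeps a trailing slash on the URL
jobs_url = add_path(account_url, account_id, "jobs", "/")
assert jobs_url == "https://cloud.getdbt.com/api/v2/accounts/40/jobs/"

# Mirrors fetch_artifact for a hypothetical run and artifact
artifact_url = add_path(account_url, account_id, "runs", "12345", "artifacts", "manifest.json")
assert artifact_url == (
    "https://cloud.getdbt.com/api/v2/accounts/40/runs/12345/artifacts/manifest.json"
)
```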
castor_extractor/transformation/dbt/client_test.py
CHANGED

@@ -46,7 +46,7 @@ def test_DbtClient_last_run():
     mock_response_default_job = [{"id": 1, "finished_at": _OLD_DATE_STR}]
     mock_response_job_42 = [{"id": 2, "finished_at": _RECENT_DATE_STR}]
 
-    with patch(infer_path, return_value=40), patch(call_path) as mocked_call:
+    with patch(infer_path, return_value="40"), patch(call_path) as mocked_call:
         credentials = DbtCredentials(token="some-token", job_id=default_job_id)
 
         dbt_client = DbtClient(credentials=credentials)
@@ -85,7 +85,7 @@ def test_DbtClient_list_job_identifiers():
         {"id": 395, "state": 1},
     ]
 
-    with patch(infer_path, return_value=40), patch(call_path) as mocked_call:
+    with patch(infer_path, return_value="40"), patch(call_path) as mocked_call:
         mocked_call.return_value = jobs
         credentials = DbtCredentials(token="some-token", job_id="1")
         dbt_client = DbtClient(credentials=credentials)
@@ -100,7 +100,7 @@ def test_DbtClient_fetch_artifacts():
     run_id = 12345
     url = "https://cloud.getdbt.com/api/v2/accounts/40/runs/{}/artifacts/{}"
 
-    with patch(infer_path, return_value=40), patch(call_path) as mocked_call:
+    with patch(infer_path, return_value="40"), patch(call_path) as mocked_call:
         credentials = DbtCredentials(token="some-token", job_id="1")
         dbt_client = DbtClient(credentials=credentials)
 
@@ -123,7 +123,7 @@ def test_DbtClient_fetch_artifacts():
 
 def test___account_url():
     base_url = "https://cloud.getdbt.com"
-    assert _account_url(base_url) == "https://cloud.getdbt.com/api/v2/accounts"
+    assert _account_url(base_url) == "https://cloud.getdbt.com/api/v2/accounts/"
 
     base_url = "https://emea.dbt.com/"
-    assert _account_url(base_url) == "https://emea.dbt.com/api/v2/accounts"
+    assert _account_url(base_url) == "https://emea.dbt.com/api/v2/accounts/"
castor_extractor/utils/__init__.py
CHANGED

@@ -52,6 +52,7 @@ from .time import (
     yesterday,
 )
 from .type import Callback, Getter, JsonType, SerializedAsset
+from .url import add_path as add_path_to_url, url_from
 from .validation import clean_path, validate_baseurl
 from .write import (
     get_output_filename,
castor_extractor/utils/client/api/client.py
CHANGED

@@ -5,7 +5,7 @@ from typing import Callable, Literal, Optional
 import requests
 from requests import Response
 
-from ...retry import retry_request
+from ...retry import retry, retry_request
 from .auth import Auth
 from .safe_request import RequestSafeMode, handle_response
 from .utils import build_url
@@ -21,6 +21,10 @@ DEFAULT_TIMEOUT = 60
 RETRY_ON_EXPIRED_TOKEN = 1
 RETRY_ON_GATEWAY_TIMEOUT = 3
 
+_TIMEOUT_RETRY_BASE_MS = 10 * 60 * 1000  # 10 minutes
+_TIMEOUT_RETRY_COUNT = 2
+_TIMEOUT_RETRY_EXCEPTIONS = (requests.exceptions.Timeout,)
+
 
 def _generate_payloads(
     method: HttpMethod,
@@ -81,6 +85,7 @@ class APIClient:
         params: Optional[dict] = None,
         data: Optional[dict] = None,
         pagination_params: Optional[dict] = None,
+        retry_on_timeout: bool = True,
     ) -> Response:
         headers = headers or {}
 
@@ -93,7 +98,17 @@ class APIClient:
 
         url = build_url(self._host, endpoint)
 
-        return requests.request(
+        if retry_on_timeout:
+            retry_wrapper = retry(
+                exceptions=_TIMEOUT_RETRY_EXCEPTIONS,
+                max_retries=_TIMEOUT_RETRY_COUNT,
+                base_ms=_TIMEOUT_RETRY_BASE_MS,
+            )
+            request_fn = retry_wrapper(requests.request)
+        else:
+            request_fn = requests.request
+
+        return request_fn(
             method=method,
             url=url,
             auth=self._auth,
@@ -119,6 +134,7 @@ class APIClient:
         params: Optional[dict] = None,
         data: Optional[dict] = None,
         pagination_params: Optional[dict] = None,
+        retry_on_timeout: bool = True,
     ):
         response = self._call(
             method="GET",
@@ -127,6 +143,7 @@ class APIClient:
             data=data,
             pagination_params=pagination_params,
             headers=headers,
+            retry_on_timeout=retry_on_timeout,
        )
         if response.status_code == HTTPStatus.UNAUTHORIZED:
             self._auth.refresh_token()
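In short, every request issued through `APIClient._call` is now wrapped in a timeout retry by default, and callers can opt out per request. A rough sketch of how a subclass might use the flag (the class and endpoints are made up; only `APIClient`, `_get`, and `retry_on_timeout` come from the diff):

```python
from castor_extractor.utils import APIClient


class ExampleClient(APIClient):
    def slow_listing(self) -> dict:
        # Default behaviour: requests.exceptions.Timeout is retried up to
        # _TIMEOUT_RETRY_COUNT times with a 10-minute base delay.
        return self._get("some/slow/endpoint")

    def flaky_listing(self) -> dict:
        # Opt out for endpoints where waiting on retries is worse than skipping,
        # as the Power BI pages endpoint does later in this diff.
        return self._get("some/flaky/endpoint", retry_on_timeout=False)
```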
castor_extractor/utils/retry.py
CHANGED
@@ -26,6 +26,15 @@ class RetryStrategy(Enum):
 DEFAULT_STRATEGY = RetryStrategy.CONSTANT
 
 
+def _warning(callable: Callable, exception: BaseException) -> None:
+    exception_type = type(exception)
+    exception_path = f"{exception_type.__module__}.{exception_type.__name__}"
+    callable_path = f"{callable.__module__}.{callable.__name__}"
+
+    msg = f"Exception '{exception_path}' occurred within `{callable_path}`"
+    logger.warning(msg)
+
+
 class Retry(BaseModel):
     """
     This class checks if the retry conditions are met, and if so, how long to
@@ -96,7 +105,7 @@ def retry(
         try:
             return None, callable(*args, **kwargs)
         except exceptions_ as err:
-
+            _warning(callable, err)
             return err, None
 
     def _func(*args, **kwargs) -> Any:
@@ -139,7 +148,7 @@ def retry_request(
             status_code = err.response.status_code
             if status_code not in exceptions_:
                 raise err
-
+            _warning(callable, err)
             return err, None
 
     def _func(*args, **kwargs) -> Any:
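The `_warning` helper only changes observability: retried exceptions are now logged before the next attempt, naming both the exception type and the wrapped callable. A hedged usage sketch of the decorator (the decorated function and its parameters are illustrative, and the direct import from `castor_extractor.utils.retry` is an assumption):

```python
import requests

from castor_extractor.utils.retry import retry


@retry(
    exceptions=(requests.exceptions.ConnectTimeout,),  # exception types that trigger a retry
    max_retries=3,
    base_ms=500,
)
def fetch_status(url: str) -> int:
    # Each caught ConnectTimeout is now logged as a warning before the next attempt
    return requests.get(url, timeout=5).status_code
```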
castor_extractor/utils/salesforce/client.py
CHANGED

@@ -1,9 +1,11 @@
 import logging
 from collections.abc import Iterator
 from functools import partial
+from http import HTTPStatus
 from typing import Optional
 
 import requests
+from requests import HTTPError, Response
 
 from ...utils import (
     APIClient,
@@ -21,6 +23,21 @@ logger = logging.getLogger(__name__)
 SALESFORCE_TIMEOUT_S = 120
 
 
+class SalesforceBadRequestError(HTTPError):
+    """
+    Custom Exception to print the response's text when an error occurs
+    during Salesforce's authentication.
+    """
+
+    def __init__(self, response: Response):
+        text = response.text
+        message = (
+            f"{response.status_code} Client Error: {response.reason} for url: {response.url}"
+            f"\nResponse text: {text}"
+        )
+        super().__init__(message, response=response)
+
+
 class SalesforceAuth(BearerAuth):
     _AUTH_ENDPOINT = "services/oauth2/token"
 
@@ -29,8 +46,16 @@ class SalesforceAuth(BearerAuth):
         self._token_payload = credentials.token_request_payload()
 
     def fetch_token(self) -> Optional[str]:
+        """
+        Fetches the access token from Salesforce using the provided credentials.
+        A custom Exception is raised if the request fails with a 400 status code.
+        """
         url = build_url(self._host, self._AUTH_ENDPOINT)
         response = requests.post(url, "POST", params=self._token_payload)
+
+        if response.status_code == HTTPStatus.BAD_REQUEST:
+            raise SalesforceBadRequestError(response)
+
         handled_response = handle_response(response)
         return handled_response["access_token"]
 
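For callers, the practical effect is that a rejected token request now carries Salesforce's OAuth error body (for example `invalid_grant` or `invalid_client_id`) in the exception message. A hedged sketch of how extraction code might surface it (the helper below is illustrative, not part of the package, and the import path is assumed from the file layout above):

```python
import logging
from typing import Optional

from castor_extractor.utils.salesforce.client import (
    SalesforceAuth,
    SalesforceBadRequestError,
)

logger = logging.getLogger(__name__)


def token_or_none(auth: SalesforceAuth) -> Optional[str]:
    try:
        return auth.fetch_token()
    except SalesforceBadRequestError as err:
        # The message already embeds response.text, so the cause of the 400 is visible
        logger.error("Salesforce authentication failed: %s", err)
        return None
```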
castor_extractor/utils/url.py
ADDED

@@ -0,0 +1,48 @@
+from urllib.parse import urlsplit, urlunsplit
+
+
+def url_from(
+    scheme: str = "",
+    netloc: str = "",
+    path: str = "",
+    query: str = "",
+    fragment: str = "",
+) -> str:
+    """Constructs an url from part"""
+    return urlunsplit((scheme, netloc, path, query, fragment))
+
+
+def add_path(base_url: str, *paths: str) -> str:
+    """Adds a path from a base_url."""
+
+    if not is_valid(base_url):
+        raise ValueError(f"Invalid base_url: {base_url}")
+    base_url = _format_base_url(base_url)
+    split = urlsplit(base_url)
+
+    return url_from(
+        split.scheme,
+        split.netloc,
+        "/".join([split.path] + [p.strip("/") for p in paths]),
+        split.query,
+        split.fragment,
+    )
+
+
+def _format_base_url(url: str) -> str:
+    """Remove trailing slash in base url, if applicable."""
+    if url.endswith("/"):
+        return url[:-1]
+    return url
+
+
+def is_valid(
+    url: str,
+    valid_schemes: tuple[str, ...] = ("http", "https"),
+) -> bool:
+    """
+    Simple url validation that ensures the scheme and that there is an hostname.
+    Malformatted url can pass this check such as http://http://toto.com
+    """
+    split = urlsplit(url)
+    return split.scheme in valid_schemes and bool(split.netloc)
castor_extractor/utils/url_test.py
ADDED

@@ -0,0 +1,55 @@
+from pytest import raises
+
+from ..utils.url import (
+    add_path,
+    is_valid,
+    url_from,
+)
+
+
+def test_add_path():
+    base = "https://test.com"
+
+    # simple
+    assert add_path(base, "toto") == f"{base}/toto"
+
+    # multiple parts
+    assert add_path(base, "to", "ta") == f"{base}/to/ta"
+
+    # multiple parts with slash
+    assert add_path(base, "a/b", "/c/d") == f"{base}/a/b/c/d"
+
+    # base with path
+    assert add_path(f"{base}/my/path", "/1/2/", "3") == f"{base}/my/path/1/2/3"
+
+    # base with query string and fragment
+    assert add_path(f"{base}?q=2#frag", "1/2") == f"{base}/1/2?q=2#frag"
+
+    # bad base url
+    with raises(ValueError):
+        add_path("toto", "toto")
+
+    # trailing slash
+    base = "https://test.com/"
+
+    # multiple parts with slash
+    assert add_path(base, "a/b", "/c/d") == "https://test.com/a/b/c/d"
+
+
+def test_url_is_valid():
+    # valid
+    assert is_valid("https://google.com")
+    assert is_valid("http://user:pass@test.com:444/my/path?my=query#fragment")
+    assert is_valid("ftp://hello.com", valid_schemes=("ftp",))
+
+    # invalid
+    assert not is_valid("hello.com")
+    assert not is_valid("ftp://hello.com")
+    assert not is_valid("http://")
+
+
+def test_url_from():
+    assert url_from() == ""
+    assert url_from("http") == "http://"
+    assert url_from("https", "google.com") == "https://google.com"
+    assert url_from(netloc="te.st", query="q=3") == "//te.st?q=3"
castor_extractor/visualization/powerbi/client/client.py
CHANGED

@@ -2,6 +2,7 @@ import logging
 from collections.abc import Iterator
 from datetime import date
 from functools import partial
+from http import HTTPStatus
 from time import sleep
 from typing import Optional, Union
 
@@ -11,6 +12,7 @@ from requests import HTTPError
 from ....utils import (
     APIClient,
     fetch_all_pages,
+    retry_request,
 )
 from ..assets import PowerBiAsset
 from .authentication import PowerBiBearerAuth
@@ -27,6 +29,9 @@ METADATA_BATCH_SIZE = 100
 POWERBI_SCAN_STATUS_DONE = "Succeeded"
 POWERBI_SCAN_SLEEP_S = 1
 
+MAX_RETRY_PAGES = 1
+RETRY_PAGES_TIMEOUT_MS = 35 * 1000  # 35 seconds
+
 logger = logging.getLogger(__name__)
 
 
@@ -71,6 +76,23 @@ class PowerbiClient(APIClient):
         """
         yield from self._get(self.endpoint_factory.dashboards())[Keys.VALUE]
 
+    @retry_request(
+        status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),
+        max_retries=MAX_RETRY_PAGES,
+        base_ms=RETRY_PAGES_TIMEOUT_MS,
+    )
+    def _pages(self, report_id: str) -> Iterator[dict]:
+        """
+        Extracts the pages of a report.
+        This endpoint is very flaky and frequently returns 400 and 404 errors.
+        After around 50 requests, it hits the rate limit and returns 429 Too Many Requests,
+        which is why we retry it after a short delay.
+        Timeouts are also common; we must skip them because the extraction task
+        might take too long otherwise.
+        """
+        pages_endpoint = self.endpoint_factory.pages(report_id)
+        return self._get(pages_endpoint, retry_on_timeout=False)[Keys.VALUE]
+
     def _reports(self) -> Iterator[dict]:
         """
         Returns a list of reports for the organization.
@@ -83,8 +105,7 @@ class PowerbiClient(APIClient):
             report_id = report.get(Keys.ID)
 
             try:
-                pages_endpoint = self.endpoint_factory.pages(report_id)
-                pages = self._get(pages_endpoint)[Keys.VALUE]
+                pages = self._pages(report_id)
                 report["pages"] = pages
             except (requests.HTTPError, requests.exceptions.Timeout) as e:
                 logger.debug(e)
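Unlike the timeout retry on `APIClient`, `retry_request` keys off HTTP status codes carried by `requests.HTTPError`. A small sketch, outside the package, of the same pattern applied to any 429-prone endpoint (the function and URL are illustrative; the decorator arguments mirror the `_pages` usage above):

```python
from http import HTTPStatus

import requests

from castor_extractor.utils import retry_request


@retry_request(
    status_codes=(HTTPStatus.TOO_MANY_REQUESTS,),  # retry only on 429
    max_retries=1,                                 # MAX_RETRY_PAGES
    base_ms=35 * 1000,                             # RETRY_PAGES_TIMEOUT_MS
)
def get_json(url: str) -> dict:
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # raises HTTPError; retry_request inspects its status code
    return response.json()
```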
castor_extractor/visualization/powerbi/client/client_test.py
CHANGED

@@ -85,7 +85,7 @@ def test__reports(power_bi_client):
     reports = list(power_bi_client._reports())
     calls = [
         call(ENDPOINT_FACTORY.reports()),
-        call(ENDPOINT_FACTORY.pages("1")),
+        call(ENDPOINT_FACTORY.pages("1"), retry_on_timeout=False),
     ]
     mocked_get.assert_has_calls(calls)
     assert reports == [
{castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: castor-extractor
-Version: 0.24.21
+Version: 0.24.25
 Summary: Extract your metadata assets.
 Home-page: https://www.castordoc.com/
 License: EULA
@@ -215,6 +215,22 @@ For any questions or bug report, contact us at [support@coalesce.io](mailto:supp
 
 # Changelog
 
+## 0.24.25 - 2025-06-12
+
+* DBT: Fix API base url
+
+## 0.24.24 - 2025-06-06
+
+* Power BI: handle rate limit issues when extracting pages
+
+## 0.24.23 - 2025-06-05
+
+* Salesforce: print response's error message when authentication fails
+
+## 0.24.22 - 2025-05-27
+
+* Add retry for `Request.Timeout` on **ApiClient**
+
 ## 0.24.21 - 2025-05-26
 
 * Looker Studio: add option to skip the extraction of view activity logs
{castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-CHANGELOG.md,sha256=
+CHANGELOG.md,sha256=qPeyQwnnzhrZuMY_sjZ0yRGgSt_bbba2Ke3z3WSqg5U,18168
 Dockerfile,sha256=xQ05-CFfGShT3oUqaiumaldwA288dj9Yb_pxofQpufg,301
 DockerfileUsage.md,sha256=2hkJQF-5JuuzfPZ7IOxgM6QgIQW7l-9oRMFVwyXC4gE,998
 LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
@@ -76,7 +76,7 @@ castor_extractor/transformation/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
 castor_extractor/transformation/coalesce/__init__.py,sha256=CW_qdtEfwgJRsCyBlk5hNlxwEO-VV6mBXZvkRbND_J8,112
 castor_extractor/transformation/coalesce/assets.py,sha256=pzccYPP66c9PAnVroemx7-6MeRHw7Ft1OlTC6jIamAA,363
 castor_extractor/transformation/coalesce/client/__init__.py,sha256=VRmVpH29rOghtDQnCN7dAdA0dI0Lxseu4BC8rnwM9dU,80
-castor_extractor/transformation/coalesce/client/client.py,sha256
+castor_extractor/transformation/coalesce/client/client.py,sha256=yCw9xSq5acgU7mfVdNSZqZ0KTHgFUi6yiVejYw2W7Q0,6523
 castor_extractor/transformation/coalesce/client/credentials.py,sha256=jbJxjbdPspf-dzYKfeb7oqL_8TXd1nvkJrjAcdAnLPc,548
 castor_extractor/transformation/coalesce/client/endpoint.py,sha256=0uLh7dpA1vsR9qr_50SEYV_-heQE4BwED9oNMgYsL-w,1272
 castor_extractor/transformation/coalesce/client/type.py,sha256=oiiVP9NL0ijTXyQmaB8aJVYckc7m-m8ZgMyNIAduUKE,43
@@ -84,8 +84,8 @@ castor_extractor/transformation/coalesce/client/utils.py,sha256=jbxh3OCbYm3fKZD1
 castor_extractor/transformation/coalesce/client/utils_test.py,sha256=Q00Y1n0Q_sZ0LFnYn98yDGFumBsifzVJSc7_3PSBMfI,1543
 castor_extractor/transformation/dbt/__init__.py,sha256=LHQROlMqYWCc7tcmhdjXtROFpJqUvCg9jPC8avHgD4I,107
 castor_extractor/transformation/dbt/assets.py,sha256=JY1nKEGySZ84wNoe7dnizwAYw2q0t8NVaIfqhB2rSw0,148
-castor_extractor/transformation/dbt/client.py,sha256=
-castor_extractor/transformation/dbt/client_test.py,sha256=
+castor_extractor/transformation/dbt/client.py,sha256=BIue1DNAn2b7kHeiXBkGNosq8jZA2DrgjP7Gi5epAPE,5684
+castor_extractor/transformation/dbt/client_test.py,sha256=RLL7y_pLDv2QBM03qBht8yYEooeT_woRADHcb8vgBQ4,4535
 castor_extractor/transformation/dbt/credentials.py,sha256=pGq7GqFQTw9TwN1DXSHC-0yJ2H6B_wMAbHyQTLqJVh0,543
 castor_extractor/types.py,sha256=nHel2hv6NoHmdpOX_heEfO2-DnZPoYA2x0eJdbFvT0s,1276
 castor_extractor/uploader/__init__.py,sha256=A4bq_SrEtKAsl0r_D_duSTvL5WIQjVfsMy7tDx9IKg0,87
@@ -96,7 +96,7 @@ castor_extractor/uploader/settings.py,sha256=3MvOX-UFRqrLZoiT7wYn9jUGro7NX4RCafY
 castor_extractor/uploader/upload.py,sha256=PSQfkO_7LSE0WBo9Tm_hlS2ONepKeB0cBFdJXySnues,4310
 castor_extractor/uploader/upload_test.py,sha256=7fwstdQe7FjuwGilsCdFpEQr1qLoR2WTRUzyy93fISw,402
 castor_extractor/uploader/utils.py,sha256=otAaySj5aeem6f0CTd0Te6ioJ6uP2J1p348j-SdIwDI,802
-castor_extractor/utils/__init__.py,sha256=
+castor_extractor/utils/__init__.py,sha256=_hC54hBfPH41TTuWMsqQcyYVF7SojrOevW3OAv8M05E,1652
 castor_extractor/utils/argument_parser.py,sha256=S4EcIh3wNDjs3fOrQnttCcPsAmG8m_Txl7xvEh0Q37s,283
 castor_extractor/utils/argument_parser_test.py,sha256=wnyLFJ74iEiPxxLSbwFtckR7FIHxsFOVU38ljs9gqRA,633
 castor_extractor/utils/batch.py,sha256=SFlLmJgVjV2nVhIrjVIEp8wJ9du4dKKHq8YVYubnwQQ,448
@@ -106,7 +106,7 @@ castor_extractor/utils/client/abstract.py,sha256=CWF7_afNpEZ3jor-22wXbKIvM20ukHk
 castor_extractor/utils/client/api/__init__.py,sha256=vlG7WXznYgLTn3XyMGsyUkgRkup8FbKM14EXJ8mv-b0,264
 castor_extractor/utils/client/api/auth.py,sha256=lq0K3UEl1vwIIa_vKTdlpIQPdE5K1-5DXmCwO4dKzng,1890
 castor_extractor/utils/client/api/auth_test.py,sha256=LlyXytnatg6ZzR4Zkvzk0BH99FYhHX7qn_nyr2MSnDI,1305
-castor_extractor/utils/client/api/client.py,sha256=
+castor_extractor/utils/client/api/client.py,sha256=qmj7KoNqt6F-cmpdaMiz_aVxzwMCgbDNcgzXSbCdu1Y,5183
 castor_extractor/utils/client/api/client_test.py,sha256=FM3ZxsLLfMOBn44cXX6FIgnA31-5TTNIyp9D4LBwtXE,1222
 castor_extractor/utils/client/api/pagination.py,sha256=ph5TYqPiyFGgygsIhCATAHPIQ9UJNZyiTcqlyRdGEno,2460
 castor_extractor/utils/client/api/pagination_test.py,sha256=jCOgXFXrH-jrCxe2dfk80ZksJF-EtmpJPU11BGabsqk,1385
@@ -138,12 +138,12 @@ castor_extractor/utils/pager/pager.py,sha256=93Rw7jCz6GnqrS4HfYfKYV2xgEx2esl1qC9
 castor_extractor/utils/pager/pager_on_id.py,sha256=jBvmlEhkJ-sODkNyz1KyyXHobLsNhC4AwhOwYvLyB4E,1967
 castor_extractor/utils/pager/pager_on_id_test.py,sha256=eDGrIYPGffuKPUATgu5fiXIwPKdSwEXGgTtfMiHqoj0,1601
 castor_extractor/utils/pager/pager_test.py,sha256=PQOXQwQD2wOP0xzZfNTuLxcn3Bpa4FCASVklH71GO_s,1699
-castor_extractor/utils/retry.py,sha256=
+castor_extractor/utils/retry.py,sha256=xRlAxHRnjmjh2sDUuuUSS-s38pokoAvSgSKjdgWGqbc,5020
 castor_extractor/utils/retry_test.py,sha256=j_6IJStBomEhxmGpIY9IIlESgMxhcDpmIKj24unLqlA,2892
 castor_extractor/utils/safe.py,sha256=gvIMRIoggdVeYMl222IYqXnHVDninDklFMlAHt-WldA,1948
 castor_extractor/utils/safe_test.py,sha256=IHN1Z761tYMFslYC-2HAfkXmFPh4LYSqNLs4QZwykjk,2160
 castor_extractor/utils/salesforce/__init__.py,sha256=fZ2U6t6AFFAIC-DLXvFHBgpBDjTvX0tFgZ8zJoehPAc,88
-castor_extractor/utils/salesforce/client.py,sha256=
+castor_extractor/utils/salesforce/client.py,sha256=wcbJScclvSHjMf6wYNVnHjmpoC22dSshmZW9rDxXKF0,3211
 castor_extractor/utils/salesforce/client_test.py,sha256=T3gUnZ0cRvnL_4dVc4lInRSO9Ti2WeLkLWV1scps4IY,668
 castor_extractor/utils/salesforce/constants.py,sha256=7yPmUeyn4IHQiHLDutXE0L_OBd41E5080vFxqA_s4Dc,58
 castor_extractor/utils/salesforce/credentials.py,sha256=m_11LIaBrYVgH2bLo-QnxaIY5KhEdtfVXz9r2lb_fd0,1123
@@ -155,6 +155,8 @@ castor_extractor/utils/string_test.py,sha256=u3P2tAPhyfCLvD19rH_JcpHhPuWTHUdg0z_
 castor_extractor/utils/time.py,sha256=jmP1QWg4lv21Jp_Oy71lfJ47hjNOSgHiBOFf964RMPU,1732
 castor_extractor/utils/time_test.py,sha256=pH8DSosNlwDYZXZNNjYDcL0WbmZc_c212LEEn88Oqew,647
 castor_extractor/utils/type.py,sha256=Sd8JlEgbGkBUZnRqCUDtREeBkOMTXtlNMyCph90_J0Q,328
+castor_extractor/utils/url.py,sha256=0YaKAz3EC5PgTb5A2TNOlxf1DANK40yw6hs7ArEtJaU,1238
+castor_extractor/utils/url_test.py,sha256=LWzNdOZqjrDeLmvhPBYmP35mzhm7jGAXi021thiro1Y,1425
 castor_extractor/utils/validation.py,sha256=dRvC9SoFVecVZuLQNN3URq37yX2sBSW3-NxIxkcol5o,1894
 castor_extractor/utils/validation_test.py,sha256=A7P6VmI0kYX2aGIeEN12y7LsY7Kpm8pE4bdVFhbBAMw,1184
 castor_extractor/utils/write.py,sha256=Z_RYm47XeHiUPPUMYMuAjQrVZ18CAkL3daQHQG1XPlM,2148
@@ -236,8 +238,8 @@ castor_extractor/visualization/powerbi/__init__.py,sha256=hoZ73ngLhMc9edqxO9PUIE
 castor_extractor/visualization/powerbi/assets.py,sha256=IB_XKwgdN1pZYGZ4RfeHrLjflianTzWf_6tg-4CIwu0,742
 castor_extractor/visualization/powerbi/client/__init__.py,sha256=UPIhMaCCdNxhiLdkItC0IPFE_AMi-SgqI_ahwjB9utI,151
 castor_extractor/visualization/powerbi/client/authentication.py,sha256=cTohunKr1nUDfvxB0sejJSyfE2BdCtwT1WMPecWlbyU,1045
-castor_extractor/visualization/powerbi/client/client.py,sha256=
-castor_extractor/visualization/powerbi/client/client_test.py,sha256=
+castor_extractor/visualization/powerbi/client/client.py,sha256=CWCYmj2spYin74qq9T8v2ZJ5TcxBuEy5EjArhCVZjLM,8141
+castor_extractor/visualization/powerbi/client/client_test.py,sha256=Ox_bHpCSckEpT6IiR7drx2c9fmaVl1btUZxnwEmamGQ,5718
 castor_extractor/visualization/powerbi/client/constants.py,sha256=88R_aGachNNUZh6OSH2fkDwZtY4KTStzKm_g7HNCqqo,387
 castor_extractor/visualization/powerbi/client/credentials.py,sha256=OVWdhZSNODzTdLysY-sbpBZ3uUkLokeayQZnbJAqt2I,1386
 castor_extractor/visualization/powerbi/client/credentials_test.py,sha256=TzFqxsWVQ3sXR_n0bJsexK9Uz7ceXCEPVqDGWTJzW60,993
@@ -425,8 +427,8 @@ castor_extractor/warehouse/sqlserver/queries/table.sql,sha256=kbBQP-TdG5px1IVgyx
 castor_extractor/warehouse/sqlserver/queries/user.sql,sha256=gOrZsMVypusR2dc4vwVs4E1a-CliRsr_UjnD2EbXs-A,94
 castor_extractor/warehouse/sqlserver/query.py,sha256=g0hPT-RmeGi2DyenAi3o72cTlQsLToXIFYojqc8E5fQ,533
 castor_extractor/warehouse/synapse/queries/column.sql,sha256=lNcFoIW3Y0PFOqoOzJEXmPvZvfAsY0AP63Mu2LuPzPo,1351
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
-castor_extractor-0.24.
+castor_extractor-0.24.25.dist-info/LICENCE,sha256=sL-IGa4hweyya1HgzMskrRdybbIa2cktzxb5qmUgDg8,8254
+castor_extractor-0.24.25.dist-info/METADATA,sha256=aJ3wfe7P_nQ9DDKS5vn9i0Ly2zps35t1-yzw-LtNeD8,25621
+castor_extractor-0.24.25.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+castor_extractor-0.24.25.dist-info/entry_points.txt,sha256=_F-qeZCybjoMkNb9ErEhnyqXuG6afHIFQhakdBHZsr4,1803
+castor_extractor-0.24.25.dist-info/RECORD,,
{castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/LICENCE
File without changes

{castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/WHEEL
File without changes

{castor_extractor-0.24.21.dist-info → castor_extractor-0.24.25.dist-info}/entry_points.txt
File without changes