castor-extractor 0.18.7__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of castor-extractor might be problematic. Click here for more details.
- CHANGELOG.md +40 -1
- castor_extractor/commands/extract_looker.py +3 -3
- castor_extractor/commands/extract_metabase_api.py +1 -1
- castor_extractor/commands/extract_metabase_db.py +1 -1
- castor_extractor/commands/extract_notion.py +16 -0
- castor_extractor/commands/file_check.py +5 -2
- castor_extractor/commands/upload.py +5 -3
- castor_extractor/knowledge/__init__.py +0 -0
- castor_extractor/knowledge/notion/__init__.py +3 -0
- castor_extractor/knowledge/notion/assets.py +9 -0
- castor_extractor/knowledge/notion/client/__init__.py +2 -0
- castor_extractor/knowledge/notion/client/client.py +145 -0
- castor_extractor/knowledge/notion/client/client_test.py +67 -0
- castor_extractor/knowledge/notion/client/constants.py +3 -0
- castor_extractor/knowledge/notion/client/credentials.py +16 -0
- castor_extractor/knowledge/notion/client/endpoints.py +18 -0
- castor_extractor/knowledge/notion/client/pagination.py +16 -0
- castor_extractor/knowledge/notion/extract.py +59 -0
- castor_extractor/quality/__init__.py +0 -0
- castor_extractor/quality/soda/__init__.py +2 -0
- castor_extractor/quality/soda/assets.py +8 -0
- castor_extractor/quality/soda/client/__init__.py +1 -0
- castor_extractor/quality/soda/client/client.py +99 -0
- castor_extractor/quality/soda/client/credentials.py +28 -0
- castor_extractor/quality/soda/client/endpoints.py +13 -0
- castor_extractor/types.py +1 -3
- castor_extractor/uploader/upload.py +0 -1
- castor_extractor/utils/__init__.py +2 -0
- castor_extractor/utils/argument_parser_test.py +0 -1
- castor_extractor/utils/client/api.py +29 -11
- castor_extractor/utils/client/api_test.py +9 -1
- castor_extractor/utils/object_test.py +1 -1
- castor_extractor/utils/pager/pager.py +1 -1
- castor_extractor/utils/pager/pager_on_id.py +11 -6
- castor_extractor/utils/safe_request.py +5 -3
- castor_extractor/utils/safe_request_test.py +1 -3
- castor_extractor/utils/string_test.py +1 -1
- castor_extractor/utils/time.py +11 -0
- castor_extractor/visualization/domo/client/client.py +2 -3
- castor_extractor/visualization/looker/api/client.py +35 -0
- castor_extractor/visualization/looker/api/extraction_parameters.py +2 -1
- castor_extractor/visualization/looker/extract.py +2 -2
- castor_extractor/visualization/metabase/assets.py +3 -1
- castor_extractor/visualization/metabase/extract.py +20 -8
- castor_extractor/visualization/mode/client/client.py +1 -1
- castor_extractor/visualization/powerbi/client/constants.py +1 -1
- castor_extractor/visualization/powerbi/client/rest.py +5 -15
- castor_extractor/visualization/qlik/client/engine/client.py +36 -5
- castor_extractor/visualization/qlik/client/engine/constants.py +1 -0
- castor_extractor/visualization/qlik/client/engine/error.py +18 -1
- castor_extractor/visualization/salesforce_reporting/client/soql.py +3 -1
- castor_extractor/visualization/tableau/extract.py +40 -16
- castor_extractor/visualization/tableau_revamp/client/client.py +2 -5
- castor_extractor/visualization/tableau_revamp/extract.py +3 -2
- castor_extractor/warehouse/databricks/client.py +54 -35
- castor_extractor/warehouse/databricks/client_test.py +44 -31
- castor_extractor/warehouse/salesforce/format_test.py +0 -1
- {castor_extractor-0.18.7.dist-info → castor_extractor-0.19.0.dist-info}/METADATA +4 -4
- {castor_extractor-0.18.7.dist-info → castor_extractor-0.19.0.dist-info}/RECORD +62 -43
- {castor_extractor-0.18.7.dist-info → castor_extractor-0.19.0.dist-info}/entry_points.txt +1 -0
- {castor_extractor-0.18.7.dist-info → castor_extractor-0.19.0.dist-info}/LICENCE +0 -0
- {castor_extractor-0.18.7.dist-info → castor_extractor-0.19.0.dist-info}/WHEEL +0 -0
CHANGELOG.md
CHANGED
|
@@ -1,5 +1,44 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.19.0 - 2024-08-21
|
|
4
|
+
|
|
5
|
+
* Breaking change Looker CLI:
|
|
6
|
+
|
|
7
|
+
`-u` and `--username` changed to `-c` and `--client-id`
|
|
8
|
+
|
|
9
|
+
`-p` and `--password` changed to `-s` and `--client-secret`
|
|
10
|
+
|
|
11
|
+
* Breaking change Metabase CLI:
|
|
12
|
+
|
|
13
|
+
`-u` and `--username` changed to `-u` and `--user`
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
* Note: if you use environment variables you shouldn't be impacted
|
|
17
|
+
|
|
18
|
+
## 0.18.13 - 2024-08-19
|
|
19
|
+
|
|
20
|
+
* Qlik: improve measures extraction
|
|
21
|
+
|
|
22
|
+
## 0.18.12 - 2024-08-19
|
|
23
|
+
|
|
24
|
+
* Bumps: looker-sdk: 24, setuptools: 72
|
|
25
|
+
|
|
26
|
+
## 0.18.11 - 2024-08-14
|
|
27
|
+
|
|
28
|
+
* Linting and formatting with Ruff
|
|
29
|
+
|
|
30
|
+
## 0.18.10 - 2024-08-13
|
|
31
|
+
|
|
32
|
+
* Soda: Added Soda extractor (Castor-Managed integration only)
|
|
33
|
+
|
|
34
|
+
## 0.18.9 - 2024-08-12
|
|
35
|
+
|
|
36
|
+
* Notion: Added Notion extractor
|
|
37
|
+
|
|
38
|
+
## 0.18.8 - 2024-08-02
|
|
39
|
+
|
|
40
|
+
* Databricks: more reliable extraction of queries
|
|
41
|
+
|
|
3
42
|
## 0.18.7 - 2024-08-01
|
|
4
43
|
|
|
5
44
|
* Salesforce: extract table descriptions
|
|
@@ -420,7 +459,7 @@
|
|
|
420
459
|
|
|
421
460
|
## 0.2.2 - 2023-03-13
|
|
422
461
|
|
|
423
|
-
*
|
|
462
|
+
* Constraint `setuptools` explicitly added
|
|
424
463
|
|
|
425
464
|
## 0.2.1 - 2023-02-23
|
|
426
465
|
|
|
@@ -7,8 +7,8 @@ from castor_extractor.visualization import looker # type: ignore
|
|
|
7
7
|
def main():
|
|
8
8
|
parser = ArgumentParser()
|
|
9
9
|
parser.add_argument("-b", "--base-url", help="Looker base url")
|
|
10
|
-
parser.add_argument("-
|
|
11
|
-
parser.add_argument("-
|
|
10
|
+
parser.add_argument("-c", "--client-id", help="Looker client id")
|
|
11
|
+
parser.add_argument("-s", "--client-secret", help="Looker client secret")
|
|
12
12
|
parser.add_argument("-o", "--output", help="Directory to write to")
|
|
13
13
|
parser.add_argument("-t", "--timeout", type=int, help="Timeout in seconds")
|
|
14
14
|
parser.add_argument(
|
|
@@ -18,7 +18,7 @@ def main():
|
|
|
18
18
|
)
|
|
19
19
|
parser.add_argument(
|
|
20
20
|
"--safe-mode",
|
|
21
|
-
"-
|
|
21
|
+
"-S",
|
|
22
22
|
help="Looker safe mode",
|
|
23
23
|
action="store_true",
|
|
24
24
|
)
|
|
@@ -11,7 +11,7 @@ def main():
|
|
|
11
11
|
parser = ArgumentParser()
|
|
12
12
|
|
|
13
13
|
parser.add_argument("-b", "--base-url", help="Metabase base url")
|
|
14
|
-
parser.add_argument("-u", "--
|
|
14
|
+
parser.add_argument("-u", "--user", help="Metabase username")
|
|
15
15
|
parser.add_argument("-p", "--password", help="Metabase password")
|
|
16
16
|
|
|
17
17
|
parser.add_argument("-o", "--output", help="Directory to write to")
|
|
@@ -19,7 +19,7 @@ def main():
|
|
|
19
19
|
parser.add_argument("-P", "--port", help="TCP/IP port number")
|
|
20
20
|
parser.add_argument("-d", "--database", help="Database name")
|
|
21
21
|
parser.add_argument("-s", "--schema", help="Schema name")
|
|
22
|
-
parser.add_argument("-u", "--
|
|
22
|
+
parser.add_argument("-u", "--user", help="Username")
|
|
23
23
|
parser.add_argument("-p", "--password", help="Password")
|
|
24
24
|
parser.add_argument(
|
|
25
25
|
"-k",
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from argparse import ArgumentParser
|
|
3
|
+
|
|
4
|
+
from castor_extractor.knowledge import notion # type: ignore
|
|
5
|
+
from castor_extractor.utils import parse_filled_arguments # type: ignore
|
|
6
|
+
|
|
7
|
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main():
    """
    Command-line entry point for the Notion extraction.

    Declares the `--token` and `--output` options, then forwards the
    arguments returned by `parse_filled_arguments` to `notion.extract_all`.
    """
    cli = ArgumentParser()

    cli.add_argument("-t", "--token", help="Notion token")
    cli.add_argument("-o", "--output", help="Directory to write to")

    filled_arguments = parse_filled_arguments(cli)
    notion.extract_all(**filled_arguments)
|
|
@@ -4,8 +4,11 @@ from argparse import ArgumentParser
|
|
|
4
4
|
from typing import Set
|
|
5
5
|
|
|
6
6
|
from castor_extractor import file_checker # type: ignore
|
|
7
|
-
from castor_extractor.utils import
|
|
8
|
-
|
|
7
|
+
from castor_extractor.utils import ( # type: ignore
|
|
8
|
+
LocalStorage, # type: ignore
|
|
9
|
+
explode,
|
|
10
|
+
search_files,
|
|
11
|
+
)
|
|
9
12
|
|
|
10
13
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
|
|
11
14
|
logger = logging.getLogger(__name__)
|
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
import argparse
|
|
2
2
|
import logging
|
|
3
3
|
|
|
4
|
-
from castor_extractor.uploader import
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
from castor_extractor.uploader import ( # type: ignore
|
|
5
|
+
FileType,
|
|
6
|
+
upload,
|
|
7
|
+
upload_manifest,
|
|
8
|
+
)
|
|
7
9
|
|
|
8
10
|
FILE_TYPES = {FileType.QUALITY, FileType.VIZ, FileType.WAREHOUSE}
|
|
9
11
|
|
|
File without changes
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from http import HTTPStatus
|
|
3
|
+
from typing import Dict, Iterator
|
|
4
|
+
|
|
5
|
+
from pydantic import ValidationError
|
|
6
|
+
|
|
7
|
+
from ....utils import RequestSafeMode, empty_iterator
|
|
8
|
+
from ....utils.client.api import APIClient, HttpMethod
|
|
9
|
+
from ..assets import NotionAsset
|
|
10
|
+
from .constants import (
|
|
11
|
+
CASTOR_NOTION_USER_AGENT,
|
|
12
|
+
NOTION_BASE_URL,
|
|
13
|
+
NOTION_TIMEOUT_MS,
|
|
14
|
+
)
|
|
15
|
+
from .credentials import NotionCredentials
|
|
16
|
+
from .endpoints import EndpointFactory
|
|
17
|
+
from .pagination import PaginatedResponse
|
|
18
|
+
|
|
19
|
+
# Module-level logger. It must be built from the `__name__` variable (not the
# literal string "__name__") so that records carry the real module path and
# follow the package's logging configuration.
logger = logging.getLogger(__name__)

# Sent as the "Notion-Version" header on every request
NOTION_VERSION = "2021-08-16"

# Safe-mode tolerance: how many errors may be ignored, and for which statuses
VOLUME_IGNORED = 10
IGNORED_ERROR_CODES = (HTTPStatus.BAD_GATEWAY,)
|
|
25
|
+
NOTION_SAFE_MODE = RequestSafeMode(
|
|
26
|
+
max_errors=VOLUME_IGNORED,
|
|
27
|
+
status_codes=IGNORED_ERROR_CODES,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _search_filter(asset: str) -> Dict[str, Dict[str, str]]:
    """Build the search payload restricting results to the given object kind"""
    criteria = {"value": asset, "property": "object"}
    return {"filter": criteria}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class NotionClient(APIClient):
    """Client fetching data from Notion"""

    def __init__(
        self,
        credentials: NotionCredentials,
        timeout: int = NOTION_TIMEOUT_MS,
        base_url: str = NOTION_BASE_URL,
        user_agent: str = CASTOR_NOTION_USER_AGENT,
        safe_mode: RequestSafeMode = NOTION_SAFE_MODE,
    ) -> None:
        """
        Configure the underlying APIClient with the Notion host, the token
        taken from the credentials, and the headers sent with every call.
        """
        base_headers = {
            "Notion-Version": NOTION_VERSION,
            "User-Agent": user_agent,
        }

        super().__init__(
            host=base_url,
            token=credentials.token,
            headers=base_headers,
            timeout=timeout,
            safe_mode=safe_mode,
        )

    def _request(
        self, method: HttpMethod, endpoint: str, params: dict
    ) -> Iterator[dict]:
        """
        API call to Notion:
        - If result is paginated, yield all pages
        - If not, yield only the response payload

        `params` is sent as query string for GET and as body for POST.
        The original params (e.g. the search filter) are re-sent together
        with the cursor on every subsequent page, so that later pages are
        fetched with the same criteria as the first one.
        """
        built_url = self.build_url(self._host, endpoint)
        # work on a copy: the caller's dict is never mutated
        query = dict(params) if params else {}

        # iterate instead of recursing: one nested generator per page would
        # grow the call stack with the number of pages
        while True:
            response_payload = self._call(
                method=method,
                url=built_url,
                params=query if query and method == "GET" else None,
                data=query if query and method == "POST" else None,
            )

            try:
                paginated_response = PaginatedResponse(**response_payload)
            except ValidationError:
                # payload doesn't have the paginated shape: yield it as is
                yield response_payload
                return

            yield from paginated_response.results

            if not paginated_response.has_more:
                return

            query = {**query, **paginated_response.start_cursor}

    def _page_listing(self) -> Iterator[dict]:
        """Search for objects of kind `page` (content is not included)"""
        return self._request(
            method="POST",
            endpoint=EndpointFactory.search(),
            params=_search_filter("page"),
        )

    def _blocks(self, block_id: str) -> Iterator[dict]:
        """Fetch the direct children of the given block or page"""
        return self._request("GET", EndpointFactory.blocks(block_id), {})

    def databases(self) -> Iterator[dict]:
        """Search for objects of kind `database`"""
        return self._request(
            method="POST",
            endpoint=EndpointFactory.search(),
            params=_search_filter("database"),
        )

    def recursive_blocks(self, block_id: str) -> Iterator[dict]:
        """
        Fetch recursively all children blocks of a given block or page

        Nested blocks are stored under the `child_blocks` key of their
        parent. Blocks of type `child_page` are yielded but not expanded.
        """
        blocks = self._blocks(block_id)
        for block in blocks:
            if block["has_children"] and block.get("type") != "child_page":
                children = self.recursive_blocks(block["id"])
                block["child_blocks"] = list(children)

            yield block

    def pages(self) -> Iterator[dict]:
        """Fetch all pages with its whole content"""
        pages = list(self._page_listing())
        logger.info(f"Extracting {len(pages)} pages ...")
        for page in pages:
            if page.get("object") == "database":
                # defensive: the page search may still return databases
                continue
            content = list(self.recursive_blocks(page["id"]))
            page["child_blocks"] = content
            yield page

    def users(self) -> Iterator[dict]:
        """Fetch all users"""
        return self._request("GET", EndpointFactory.users(), {})

    def fetch(self, asset: NotionAsset) -> Iterator[dict]:
        """
        Returns the needed metadata for the queried asset

        Raises ValueError (upon iteration) when the asset is unknown.
        """
        if asset == NotionAsset.PAGES:
            yield from self.pages()

        elif asset == NotionAsset.DATABASES:
            yield from self.databases()

        elif asset == NotionAsset.USERS:
            yield from self.users()

        else:
            raise ValueError(f"This asset {asset} is unknown")
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from unittest.mock import patch
|
|
2
|
+
|
|
3
|
+
from .client import NotionClient
|
|
4
|
+
from .credentials import NotionCredentials
|
|
5
|
+
|
|
6
|
+
MOCK_PAGINATED_RESPONSE = {
|
|
7
|
+
"results": [{"result_id": 1}],
|
|
8
|
+
"next_cursor": "2",
|
|
9
|
+
"has_more": True,
|
|
10
|
+
}
|
|
11
|
+
MOCK_PAGINATED_RESPONSE_2 = {
|
|
12
|
+
"results": [{"result_id": 2}],
|
|
13
|
+
"next_cursor": None,
|
|
14
|
+
"has_more": False,
|
|
15
|
+
}
|
|
16
|
+
MOCK_RESPONSE = {"result_id": 3}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@patch.object(NotionClient, "_call")
def test_NotionClient__request(mock_call):
    """Paginated payloads are flattened, other payloads are yielded as is"""
    mock_call.side_effect = [
        MOCK_PAGINATED_RESPONSE,
        MOCK_PAGINATED_RESPONSE_2,
        MOCK_RESPONSE,
    ]
    credentials = NotionCredentials(token="MockToken")
    client = NotionClient(credentials)

    # first two mocked calls: a paginated response spread over two pages
    paginated = list(client._request("GET", "fake_endpoint", {}))
    assert paginated == [{"result_id": 1}, {"result_id": 2}]

    # third mocked call: a payload without pagination fields
    single = list(client._request("GET", "fake_endpoint", {}))
    assert single == [{"result_id": 3}]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
MOCK_BLOCK = [{"object": "block", "id": "1", "has_children": True}]
|
|
36
|
+
MOCK_BLOCK_CHILDREN = [
|
|
37
|
+
{"object": "block", "id": "2", "has_children": False},
|
|
38
|
+
{"object": "block", "id": "3", "has_children": True, "type": "child_page"},
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@patch.object(NotionClient, "_blocks")
def test_NotionClient__recursive_blocks(mock_blocks):
    """Children are nested under `child_blocks`, child pages are not expanded"""
    mock_blocks.side_effect = [MOCK_BLOCK, MOCK_BLOCK_CHILDREN]
    credentials = NotionCredentials(token="MockToken")
    client = NotionClient(credentials)

    leaf = {"object": "block", "id": "2", "has_children": False}
    child_page = {
        "object": "block",
        "id": "3",
        "has_children": True,
        "type": "child_page",
    }
    root = {
        "object": "block",
        "id": "1",
        "has_children": True,
        "child_blocks": [leaf, child_page],
    }

    response = list(client.recursive_blocks("1"))

    assert response == [root]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from pydantic import Field
|
|
2
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
3
|
+
|
|
4
|
+
CASTOR_ENV_PREFIX = "CASTOR_NOTION_"
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class NotionCredentials(BaseSettings):
    """
    Settings holding the Notion REST API token.

    Values are given explicitly or read from environment variables
    prefixed with CASTOR_ENV_PREFIX; unknown inputs are ignored.
    """

    model_config = SettingsConfigDict(
        populate_by_name=True,
        extra="ignore",
        env_prefix=CASTOR_ENV_PREFIX,
    )

    # excluded from repr
    token: str = Field(repr=False)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
class EndpointFactory:
    """Builds the relative paths of the Notion endpoints used by the client"""

    BLOCKS = "blocks"
    SEARCH = "search"
    USERS = "users"

    @classmethod
    def search(cls) -> str:
        """Path of the search endpoint"""
        return cls.SEARCH

    @classmethod
    def blocks(cls, block_id: str) -> str:
        """Path listing the children of the given block"""
        return "{}/{}/children".format(cls.BLOCKS, block_id)

    @classmethod
    def users(cls) -> str:
        """Path of the users endpoint"""
        return cls.USERS
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pydantic.dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class PaginatedResponse:
    """Validated shape of a paginated payload"""

    results: list
    next_cursor: Optional[str]
    has_more: bool

    @property
    def start_cursor(self) -> dict:
        """Parameters to send in order to request the following page"""
        return dict(start_cursor=self.next_cursor)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Iterable, Iterator, Tuple, Union
|
|
3
|
+
|
|
4
|
+
from ...utils import (
|
|
5
|
+
OUTPUT_DIR,
|
|
6
|
+
current_timestamp,
|
|
7
|
+
deep_serialize,
|
|
8
|
+
from_env,
|
|
9
|
+
get_output_filename,
|
|
10
|
+
write_json,
|
|
11
|
+
write_summary,
|
|
12
|
+
)
|
|
13
|
+
from .assets import NotionAsset
|
|
14
|
+
from .client import NotionClient, NotionCredentials
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def iterate_all_data(
    client: NotionClient,
) -> Iterable[Tuple[NotionAsset, Union[list, Iterator, dict]]]:
    """
    Yield (asset, serialized records) for users, pages and databases,
    in that order. Each asset is fully fetched before being yielded.
    """
    extractions = (
        ("USERS", NotionAsset.USERS, client.users),
        ("PAGES", NotionAsset.PAGES, client.pages),
        ("DATABASES", NotionAsset.DATABASES, client.databases),
    )

    for label, asset, fetcher in extractions:
        logger.info(f"Extracting {label} from API")
        records = list(deep_serialize(fetcher()))
        yield asset, records
        # logged once the consumer asks for the next asset
        logger.info(f"Extracted {len(records)} {label.lower()} from API")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def extract_all(**kwargs) -> None:
    """
    Run the Notion extraction and write one JSON file per asset, followed
    by a summary, under the output directory.

    The directory is taken from the `output` argument when provided,
    otherwise from the OUTPUT_DIR environment setting. All arguments are
    also handed to NotionCredentials.
    """
    target_dir = kwargs.get("output") or from_env(OUTPUT_DIR)

    notion_client = NotionClient(credentials=NotionCredentials(**kwargs))

    started_at = current_timestamp()

    for asset, payload in iterate_all_data(notion_client):
        path = get_output_filename(asset.name.lower(), target_dir, started_at)
        write_json(path, payload)

    finished_at = current_timestamp()
    elapsed = finished_at - started_at

    write_summary(target_dir, started_at, duration_second=elapsed)
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .client import SodaClient, SodaCredentials
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from time import sleep
|
|
2
|
+
from typing import Any, Iterator, Optional
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
from ....utils import format_date, yesterday
|
|
7
|
+
from ....utils.client.api import DEFAULT_TIMEOUT_S, HttpMethod
|
|
8
|
+
from ..assets import SodaAsset
|
|
9
|
+
from .credentials import SodaCredentials
|
|
10
|
+
from .endpoints import SodaEndpointFactory
|
|
11
|
+
|
|
12
|
+
# Number of records requested per page on each API
_CLOUD_PAGE_SIZE = 100
_REPORTING_PAGE_SIZE = 400

# Pause between two cloud API pages: the per-request interval for the given
# rate, plus one second
_SECONDS_PER_MINUTE = 60
_REQUESTS_PER_MINUTE = 10
_RATE_LIMIT_S = 1 + (_SECONDS_PER_MINUTE // _REQUESTS_PER_MINUTE)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class SodaClient:
    """Client fetching datasets and check results from the Soda APIs"""

    def __init__(
        self,
        credentials: SodaCredentials,
    ):
        """Keep the timeout and both authentication forms used by the calls"""
        self._timeout = DEFAULT_TIMEOUT_S
        # headers-based authentication, used for check results
        self._reporting_headers = credentials.reporting_headers
        # (api_key, secret) pair passed as `auth`, used for datasets
        self._auth = (credentials.api_key, credentials.secret)

    def _call(
        self,
        url: str,
        headers: dict,
        method: HttpMethod = "GET",
        *,
        params: Optional[dict] = None,
        data: Optional[dict] = None,
        auth: Optional[tuple] = None,
    ) -> Any:
        """
        Perform a single HTTP request and return the decoded JSON body.

        `data` is sent as JSON body. An HTTP error status raises through
        `raise_for_status`.
        """
        response = requests.request(
            method,
            url,
            headers=headers,
            params=params,
            json=data,
            timeout=self._timeout,
            auth=auth,
        )
        response.raise_for_status()

        return response.json()

    def _get_results_paginated(self, url: str, additional: dict) -> Iterator:
        """
        Yield every record of a POST endpoint paginated on `page`/`size`.

        Pages are numbered from 1. `additional` is merged into the body and
        takes precedence over the pagination keys. Iteration stops at the
        first page that is not full.
        """
        page_number = 1
        next_page = True
        while next_page:
            json_data = {
                "page": page_number,
                "size": _REPORTING_PAGE_SIZE,
                **additional,
            }
            _check_results_page = self._call(
                url=url,
                method="POST",
                data=json_data,
                headers=self._reporting_headers,
            )
            yield from _check_results_page["data"]

            next_page = len(_check_results_page["data"]) == _REPORTING_PAGE_SIZE
            page_number += 1

    def _datasets(self) -> Iterator[dict]:
        """
        Yield all datasets, page by page (numbered from 0), until the
        response flags the last page. Sleeps after each call to stay under
        the request rate.
        """
        url = SodaEndpointFactory.datasets()
        next_page = True
        page_number = 0
        while next_page:
            data = self._call(
                url=url,
                method="GET",
                headers={},
                params={"size": _CLOUD_PAGE_SIZE, "page": page_number},
                auth=self._auth,
            )
            yield from data["content"]
            next_page = not data["last"]
            page_number += 1
            sleep(_RATE_LIMIT_S)

    def _check_results(self) -> Iterator:
        """Yield check results, requested from yesterday's date onwards"""
        url = SodaEndpointFactory.check_results()
        _date = format_date(timestamp=yesterday())
        return self._get_results_paginated(
            url, additional={"from_datetime": _date}
        )

    def fetch(self, asset: SodaAsset) -> Iterator[dict]:
        """
        Yield the records of the requested asset.

        Raises ValueError (upon iteration) for an unsupported asset only:
        supported assets must end normally once exhausted, instead of
        falling through to the error.
        """
        if asset == SodaAsset.DATASETS:
            yield from self._datasets()
        elif asset == SodaAsset.CHECK_RESULTS:
            yield from self._check_results()
        else:
            raise ValueError(f"The asset {asset}, is not supported")
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
5
|
+
from requests.auth import HTTPBasicAuth
|
|
6
|
+
|
|
7
|
+
SODA_ENV_PREFIX = "CASTOR_SODA_"
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SodaCredentials(BaseSettings):
    """
    Settings holding the Soda API key and secret.

    Values are given explicitly or read from environment variables
    prefixed with SODA_ENV_PREFIX; unknown inputs are ignored.
    """

    model_config = SettingsConfigDict(
        populate_by_name=True,
        extra="ignore",
        env_prefix=SODA_ENV_PREFIX,
    )

    # both excluded from repr
    api_key: str = Field(repr=False)
    secret: str = Field(repr=False)

    @property
    def reporting_headers(self) -> Dict[str, str]:
        """Headers carrying the key and secret, with a JSON content type"""
        headers = {
            "Content-Type": "application/json",
            "API_KEY_ID": self.api_key,
            "API_KEY_SECRET": self.secret,
        }
        return headers
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
class SodaEndpointFactory:
    """Builds the absolute URLs of the Soda endpoints used by the client"""

    CLOUD_API = "https://cloud.soda.io/api/v1"
    REPORTING_API = "https://reporting.cloud.soda.io/v1"

    @classmethod
    def check_results(cls) -> str:
        """URL of the check results endpoint (reporting API)"""
        return cls.REPORTING_API + "/quality/check_results"

    @classmethod
    def datasets(cls) -> str:
        """URL of the datasets endpoint (cloud API)"""
        return cls.CLOUD_API + "/datasets"
|
castor_extractor/types.py
CHANGED