castor-extractor 0.18.5__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff shows the content of publicly available package versions as published to their public registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in those registries.

Potentially problematic release: this version of castor-extractor might be problematic.
Files changed (69)
  1. CHANGELOG.md +48 -1
  2. castor_extractor/commands/extract_looker.py +3 -3
  3. castor_extractor/commands/extract_metabase_api.py +1 -1
  4. castor_extractor/commands/extract_metabase_db.py +1 -1
  5. castor_extractor/commands/extract_notion.py +16 -0
  6. castor_extractor/commands/file_check.py +5 -2
  7. castor_extractor/commands/upload.py +5 -3
  8. castor_extractor/knowledge/__init__.py +0 -0
  9. castor_extractor/knowledge/notion/__init__.py +3 -0
  10. castor_extractor/knowledge/notion/assets.py +9 -0
  11. castor_extractor/knowledge/notion/client/__init__.py +2 -0
  12. castor_extractor/knowledge/notion/client/client.py +145 -0
  13. castor_extractor/knowledge/notion/client/client_test.py +67 -0
  14. castor_extractor/knowledge/notion/client/constants.py +3 -0
  15. castor_extractor/knowledge/notion/client/credentials.py +16 -0
  16. castor_extractor/knowledge/notion/client/endpoints.py +18 -0
  17. castor_extractor/knowledge/notion/client/pagination.py +16 -0
  18. castor_extractor/knowledge/notion/extract.py +59 -0
  19. castor_extractor/quality/__init__.py +0 -0
  20. castor_extractor/quality/soda/__init__.py +2 -0
  21. castor_extractor/quality/soda/assets.py +8 -0
  22. castor_extractor/quality/soda/client/__init__.py +1 -0
  23. castor_extractor/quality/soda/client/client.py +99 -0
  24. castor_extractor/quality/soda/client/credentials.py +28 -0
  25. castor_extractor/quality/soda/client/endpoints.py +13 -0
  26. castor_extractor/types.py +1 -3
  27. castor_extractor/uploader/upload.py +0 -1
  28. castor_extractor/utils/__init__.py +2 -0
  29. castor_extractor/utils/argument_parser_test.py +0 -1
  30. castor_extractor/utils/client/api.py +29 -11
  31. castor_extractor/utils/client/api_test.py +9 -1
  32. castor_extractor/utils/object_test.py +1 -1
  33. castor_extractor/utils/pager/pager.py +1 -1
  34. castor_extractor/utils/pager/pager_on_id.py +11 -6
  35. castor_extractor/utils/safe_request.py +5 -3
  36. castor_extractor/utils/safe_request_test.py +1 -3
  37. castor_extractor/utils/string_test.py +1 -1
  38. castor_extractor/utils/time.py +11 -0
  39. castor_extractor/visualization/domo/client/client.py +2 -3
  40. castor_extractor/visualization/looker/api/client.py +35 -0
  41. castor_extractor/visualization/looker/api/extraction_parameters.py +2 -1
  42. castor_extractor/visualization/looker/extract.py +2 -2
  43. castor_extractor/visualization/metabase/assets.py +3 -1
  44. castor_extractor/visualization/metabase/extract.py +20 -8
  45. castor_extractor/visualization/mode/client/client.py +1 -1
  46. castor_extractor/visualization/powerbi/client/constants.py +1 -1
  47. castor_extractor/visualization/powerbi/client/rest.py +5 -15
  48. castor_extractor/visualization/qlik/client/engine/client.py +36 -5
  49. castor_extractor/visualization/qlik/client/engine/constants.py +1 -0
  50. castor_extractor/visualization/qlik/client/engine/error.py +18 -1
  51. castor_extractor/visualization/salesforce_reporting/client/soql.py +3 -1
  52. castor_extractor/visualization/tableau/extract.py +40 -16
  53. castor_extractor/visualization/tableau_revamp/client/client.py +2 -5
  54. castor_extractor/visualization/tableau_revamp/extract.py +3 -2
  55. castor_extractor/warehouse/bigquery/client.py +41 -6
  56. castor_extractor/warehouse/bigquery/extract.py +1 -0
  57. castor_extractor/warehouse/bigquery/query.py +23 -9
  58. castor_extractor/warehouse/bigquery/types.py +1 -2
  59. castor_extractor/warehouse/databricks/client.py +54 -35
  60. castor_extractor/warehouse/databricks/client_test.py +44 -31
  61. castor_extractor/warehouse/salesforce/client.py +28 -3
  62. castor_extractor/warehouse/salesforce/format.py +1 -1
  63. castor_extractor/warehouse/salesforce/format_test.py +1 -2
  64. castor_extractor/warehouse/salesforce/soql.py +6 -1
  65. {castor_extractor-0.18.5.dist-info → castor_extractor-0.19.0.dist-info}/METADATA +4 -4
  66. {castor_extractor-0.18.5.dist-info → castor_extractor-0.19.0.dist-info}/RECORD +69 -50
  67. {castor_extractor-0.18.5.dist-info → castor_extractor-0.19.0.dist-info}/entry_points.txt +1 -0
  68. {castor_extractor-0.18.5.dist-info → castor_extractor-0.19.0.dist-info}/LICENCE +0 -0
  69. {castor_extractor-0.18.5.dist-info → castor_extractor-0.19.0.dist-info}/WHEEL +0 -0
CHANGELOG.md CHANGED
@@ -1,5 +1,52 @@
  # Changelog
 
+ ## 0.19.0 - 2024-08-21
+
+ * Breaking change Looker CLI:
+
+ `-u` and `--username` changed to `-c` `--client-id`
+
+ `-p` and `--password` changed to `-s` `--client-secret`
+
+ * Breaking change Metabase CLI:
+
+ `-u` and `--username` changed to `-u` `--user`
+
+
+ * Note: if you use environment variables you shouldn't be impacted
+
+ ## 0.18.13 - 2024-08-19
+
+ * Qlik: improve measures extraction
+
+ ## 0.18.12 - 2024-08-19
+
+ * Bumps: looker-sdk: 24, setuptools: 72
+
+ ## 0.18.11 - 2023-08-14
+
+ * Linting and formatting with Ruff
+
+ ## 0.18.10 - 2024-08-13
+
+ * Soda: Added Soda extractor (Castor-Managed integration only)
+
+ ## 0.18.9 - 2024-08-12
+
+ * Notion: Added Notion extractor
+
+ ## 0.18.8 - 2024-08-02
+
+ * Databricks: more reliable extraction of queries
+
+ ## 0.18.7 - 2024-08-01
+
+ * Salesforce: extract table descriptions
+
+ ## 0.18.6 - 2024-07-30
+
+ * BigQuery: introduce extended regions to extract missing queries
+
  ## 0.18.5 - 2024-07-17
 
  * Salesforce: extract DeveloperName and tooling url
@@ -412,7 +459,7 @@
 
  ## 0.2.2 - 2023-03-13
 
- * Constrain `setuptools` explicitly added
+ * Constraint `setuptools` explicitly added
 
  ## 0.2.1 - 2023-02-23
 
castor_extractor/commands/extract_looker.py CHANGED
@@ -7,8 +7,8 @@ from castor_extractor.visualization import looker # type: ignore
  def main():
  parser = ArgumentParser()
  parser.add_argument("-b", "--base-url", help="Looker base url")
- parser.add_argument("-u", "--username", help="Looker client id")
- parser.add_argument("-p", "--password", help="Looker client secret")
+ parser.add_argument("-c", "--client-id", help="Looker client id")
+ parser.add_argument("-s", "--client-secret", help="Looker client secret")
  parser.add_argument("-o", "--output", help="Directory to write to")
  parser.add_argument("-t", "--timeout", type=int, help="Timeout in seconds")
  parser.add_argument(
@@ -18,7 +18,7 @@ def main():
  )
  parser.add_argument(
  "--safe-mode",
- "-s",
+ "-S",
  help="Looker safe mode",
  action="store_true",
  )
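The two hunks above implement the Looker CLI flag rename announced in the 0.19.0 changelog entry. A minimal sketch of how the renamed flags parse, mirroring the parser shown above; the argv values are illustrative only, and lowercase -s now belongs to --client-secret, which is presumably why --safe-mode moved to -S.

# Sketch of the renamed Looker CLI flags after 0.19.0 (argv values are illustrative).
# Old flags: -u/--username and -p/--password; new flags: -c/--client-id and -s/--client-secret.
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("-b", "--base-url", help="Looker base url")
parser.add_argument("-c", "--client-id", help="Looker client id")
parser.add_argument("-s", "--client-secret", help="Looker client secret")
parser.add_argument("--safe-mode", "-S", help="Looker safe mode", action="store_true")

args = parser.parse_args(
    ["-b", "https://example.looker.com", "-c", "my-client-id", "-s", "my-client-secret", "-S"]
)
print(args.client_id, args.client_secret, args.safe_mode)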
castor_extractor/commands/extract_metabase_api.py CHANGED
@@ -11,7 +11,7 @@ def main():
  parser = ArgumentParser()
 
  parser.add_argument("-b", "--base-url", help="Metabase base url")
- parser.add_argument("-u", "--username", help="Metabase username")
+ parser.add_argument("-u", "--user", help="Metabase username")
  parser.add_argument("-p", "--password", help="Metabase password")
 
  parser.add_argument("-o", "--output", help="Directory to write to")
castor_extractor/commands/extract_metabase_db.py CHANGED
@@ -19,7 +19,7 @@ def main():
  parser.add_argument("-P", "--port", help="TCP/IP port number")
  parser.add_argument("-d", "--database", help="Database name")
  parser.add_argument("-s", "--schema", help="Schema name")
- parser.add_argument("-u", "--username", help="Username")
+ parser.add_argument("-u", "--user", help="Username")
  parser.add_argument("-p", "--password", help="Password")
  parser.add_argument(
  "-k",
castor_extractor/commands/extract_notion.py ADDED
@@ -0,0 +1,16 @@
+ import logging
+ from argparse import ArgumentParser
+
+ from castor_extractor.knowledge import notion # type: ignore
+ from castor_extractor.utils import parse_filled_arguments # type: ignore
+
+ logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
+
+
+ def main():
+ parser = ArgumentParser()
+
+ parser.add_argument("-t", "--token", help="Notion token")
+ parser.add_argument("-o", "--output", help="Directory to write to")
+
+ notion.extract_all(**parse_filled_arguments(parser))
castor_extractor/commands/file_check.py CHANGED
@@ -4,8 +4,11 @@ from argparse import ArgumentParser
  from typing import Set
 
  from castor_extractor import file_checker # type: ignore
- from castor_extractor.utils import LocalStorage # type: ignore
- from castor_extractor.utils import explode, search_files # type: ignore
+ from castor_extractor.utils import ( # type: ignore
+ LocalStorage, # type: ignore
+ explode,
+ search_files,
+ )
 
  logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")
  logger = logging.getLogger(__name__)
castor_extractor/commands/upload.py CHANGED
@@ -1,9 +1,11 @@
  import argparse
  import logging
 
- from castor_extractor.uploader import FileType # type: ignore
- from castor_extractor.uploader import upload # type: ignore
- from castor_extractor.uploader import upload_manifest # type: ignore
+ from castor_extractor.uploader import ( # type: ignore
+ FileType,
+ upload,
+ upload_manifest,
+ )
 
  FILE_TYPES = {FileType.QUALITY, FileType.VIZ, FileType.WAREHOUSE}
 
castor_extractor/knowledge/__init__.py File without changes (new empty file)
castor_extractor/knowledge/notion/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .assets import NotionAsset
+ from .client import NotionClient, NotionCredentials
+ from .extract import extract_all
castor_extractor/knowledge/notion/assets.py ADDED
@@ -0,0 +1,9 @@
+ from ...types import ExternalAsset
+
+
+ class NotionAsset(ExternalAsset):
+ """Notion assets"""
+
+ DATABASES = "databases"
+ PAGES = "pages"
+ USERS = "users"
castor_extractor/knowledge/notion/client/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .client import NotionClient
+ from .credentials import NotionCredentials
castor_extractor/knowledge/notion/client/client.py ADDED
@@ -0,0 +1,145 @@
+ import logging
+ from http import HTTPStatus
+ from typing import Dict, Iterator
+
+ from pydantic import ValidationError
+
+ from ....utils import RequestSafeMode, empty_iterator
+ from ....utils.client.api import APIClient, HttpMethod
+ from ..assets import NotionAsset
+ from .constants import (
+ CASTOR_NOTION_USER_AGENT,
+ NOTION_BASE_URL,
+ NOTION_TIMEOUT_MS,
+ )
+ from .credentials import NotionCredentials
+ from .endpoints import EndpointFactory
+ from .pagination import PaginatedResponse
+
+ logger = logging.getLogger("__name__")
+
+ NOTION_VERSION = "2021-08-16"
+
+ VOLUME_IGNORED = 10
+ IGNORED_ERROR_CODES = (HTTPStatus.BAD_GATEWAY,)
+ NOTION_SAFE_MODE = RequestSafeMode(
+ max_errors=VOLUME_IGNORED,
+ status_codes=IGNORED_ERROR_CODES,
+ )
+
+
+ def _search_filter(asset: str) -> Dict[str, Dict[str, str]]:
+ return {"filter": {"value": asset, "property": "object"}}
+
+
+ class NotionClient(APIClient):
+ """Client fetching data from Notion"""
+
+ def __init__(
+ self,
+ credentials: NotionCredentials,
+ timeout: int = NOTION_TIMEOUT_MS,
+ base_url: str = NOTION_BASE_URL,
+ user_agent: str = CASTOR_NOTION_USER_AGENT,
+ safe_mode: RequestSafeMode = NOTION_SAFE_MODE,
+ ) -> None:
+ base_headers = {
+ "Notion-Version": NOTION_VERSION,
+ "User-Agent": user_agent,
+ }
+
+ super().__init__(
+ host=base_url,
+ token=credentials.token,
+ headers=base_headers,
+ timeout=timeout,
+ safe_mode=safe_mode,
+ )
+
+ def _request(
+ self, method: HttpMethod, endpoint: str, params: dict
+ ) -> Iterator[dict]:
+ """
+ API call to Notion:
+ - If result is paginated, yield all pages
+ - If not, yield only the response payload
+ """
+ built_url = self.build_url(self._host, endpoint)
+ response_payload = self._call(
+ method=method,
+ url=built_url,
+ params=params if params and method == "GET" else None,
+ data=params if params and method == "POST" else None,
+ )
+ try:
+ paginated_response = PaginatedResponse(**response_payload)
+ yield from paginated_response.results
+
+ if not paginated_response.has_more:
+ return empty_iterator()
+
+ yield from self._request(
+ method=method,
+ endpoint=endpoint,
+ params=paginated_response.start_cursor,
+ )
+
+ except ValidationError:
+ yield response_payload
+
+ def _page_listing(self) -> Iterator[dict]:
+ return self._request(
+ method="POST",
+ endpoint=EndpointFactory.search(),
+ params=_search_filter("page"),
+ )
+
+ def _blocks(self, block_id: str) -> Iterator[dict]:
+ return self._request("GET", EndpointFactory.blocks(block_id), {})
+
+ def databases(self) -> Iterator[dict]:
+ return self._request(
+ method="POST",
+ endpoint=EndpointFactory.search(),
+ params=_search_filter("database"),
+ )
+
+ def recursive_blocks(self, block_id: str) -> Iterator[dict]:
+ """Fetch recursively all children blocks of a given block or page"""
+ blocks = self._blocks(block_id)
+ for block in blocks:
+ if block["has_children"] and block.get("type") != "child_page":
+ children = self.recursive_blocks(block["id"])
+ block["child_blocks"] = list(children)
+
+ yield block
+
+ def pages(self) -> Iterator[dict]:
+ """Fetch all pages with its whole content"""
+ pages = list(self._page_listing())
+ logger.info(f"Extracting {len(pages)} pages ...")
+ for page in pages:
+ if page.get("object") == "database":
+ # Notion Search API filter for page doesn't work
+ continue
+ content = list(self.recursive_blocks(page["id"]))
+ page["child_blocks"] = content
+ yield page
+
+ def users(self) -> Iterator[dict]:
+ """Fetch all users"""
+ return self._request("GET", EndpointFactory.users(), {})
+
+ def fetch(self, asset: NotionAsset) -> Iterator[dict]:
+ """Returns the needed metadata for the queried asset"""
+ if asset == NotionAsset.PAGES:
+ yield from self.pages()
+
+ elif asset == NotionAsset.DATABASES:
+ yield from self.databases()
+
+ elif asset == NotionAsset.USERS:
+ yield from self.users()
+
+ else:
+ raise ValueError(f"This asset {asset} is unknown")
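For orientation, a hedged usage sketch of the client added above, assuming the exports declared in castor_extractor/knowledge/notion/__init__.py; the token value is illustrative, and real calls would hit the Notion API.

# Hedged usage sketch of NotionClient (illustrative token; performs real API calls).
from castor_extractor.knowledge.notion import (
    NotionAsset,
    NotionClient,
    NotionCredentials,
)

credentials = NotionCredentials(token="secret-notion-token")  # illustrative value
client = NotionClient(credentials=credentials)

# fetch() dispatches on the asset enum: PAGES, DATABASES or USERS.
for user in client.fetch(NotionAsset.USERS):
    print(user.get("id"))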
castor_extractor/knowledge/notion/client/client_test.py ADDED
@@ -0,0 +1,67 @@
+ from unittest.mock import patch
+
+ from .client import NotionClient
+ from .credentials import NotionCredentials
+
+ MOCK_PAGINATED_RESPONSE = {
+ "results": [{"result_id": 1}],
+ "next_cursor": "2",
+ "has_more": True,
+ }
+ MOCK_PAGINATED_RESPONSE_2 = {
+ "results": [{"result_id": 2}],
+ "next_cursor": None,
+ "has_more": False,
+ }
+ MOCK_RESPONSE = {"result_id": 3}
+
+
+ @patch.object(NotionClient, "_call")
+ def test_NotionClient__request(mock_call):
+ mock_call.side_effect = [
+ MOCK_PAGINATED_RESPONSE,
+ MOCK_PAGINATED_RESPONSE_2,
+ MOCK_RESPONSE,
+ ]
+
+ client = NotionClient(NotionCredentials(token="MockToken"))
+ response = list(client._request("GET", "fake_endpoint", {}))
+ assert response == [{"result_id": 1}, {"result_id": 2}]
+
+ response = list(client._request("GET", "fake_endpoint", {}))
+ assert response == [{"result_id": 3}]
+
+
+ MOCK_BLOCK = [{"object": "block", "id": "1", "has_children": True}]
+ MOCK_BLOCK_CHILDREN = [
+ {"object": "block", "id": "2", "has_children": False},
+ {"object": "block", "id": "3", "has_children": True, "type": "child_page"},
+ ]
+
+
+ @patch.object(NotionClient, "_blocks")
+ def test_NotionClient__recursive_blocks(mock_blocks):
+ mock_blocks.side_effect = [
+ MOCK_BLOCK,
+ MOCK_BLOCK_CHILDREN,
+ ]
+
+ client = NotionClient(NotionCredentials(token="MockToken"))
+ response = list(client.recursive_blocks("1"))
+
+ assert response == [
+ {
+ "object": "block",
+ "id": "1",
+ "has_children": True,
+ "child_blocks": [
+ {"object": "block", "id": "2", "has_children": False},
+ {
+ "object": "block",
+ "id": "3",
+ "has_children": True,
+ "type": "child_page",
+ },
+ ],
+ }
+ ]
castor_extractor/knowledge/notion/client/constants.py ADDED
@@ -0,0 +1,3 @@
+ NOTION_BASE_URL = "https://api.notion.com/v1/"
+ CASTOR_NOTION_USER_AGENT = "castor-extractor"
+ NOTION_TIMEOUT_MS = 300
castor_extractor/knowledge/notion/client/credentials.py ADDED
@@ -0,0 +1,16 @@
+ from pydantic import Field
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+ CASTOR_ENV_PREFIX = "CASTOR_NOTION_"
+
+
+ class NotionCredentials(BaseSettings):
+ """Class to handle Notion rest API permissions"""
+
+ model_config = SettingsConfigDict(
+ env_prefix=CASTOR_ENV_PREFIX,
+ extra="ignore",
+ populate_by_name=True,
+ )
+
+ token: str = Field(repr=False)
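Because NotionCredentials is a pydantic-settings model with the CASTOR_NOTION_ prefix, the token can also come from the environment rather than a keyword argument, which matches the changelog note that environment-variable users are unaffected by the CLI renames. A small sketch with an illustrative value:

# Sketch: resolving the Notion token from the environment via the CASTOR_NOTION_ prefix.
import os

from castor_extractor.knowledge.notion import NotionCredentials

os.environ["CASTOR_NOTION_TOKEN"] = "secret-notion-token"  # illustrative value
credentials = NotionCredentials()  # token picked up from CASTOR_NOTION_TOKEN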
castor_extractor/knowledge/notion/client/endpoints.py ADDED
@@ -0,0 +1,18 @@
+ class EndpointFactory:
+ """Wrapper class around all endpoints we're using"""
+
+ BLOCKS = "blocks"
+ SEARCH = "search"
+ USERS = "users"
+
+ @classmethod
+ def users(cls) -> str:
+ return cls.USERS
+
+ @classmethod
+ def blocks(cls, block_id: str) -> str:
+ return f"{cls.BLOCKS}/{block_id}/children"
+
+ @classmethod
+ def search(cls) -> str:
+ return cls.SEARCH
castor_extractor/knowledge/notion/client/pagination.py ADDED
@@ -0,0 +1,16 @@
+ from typing import Optional
+
+ from pydantic.dataclasses import dataclass
+
+
+ @dataclass
+ class PaginatedResponse:
+ """Class to handle paginated results"""
+
+ results: list
+ next_cursor: Optional[str]
+ has_more: bool
+
+ @property
+ def start_cursor(self) -> dict:
+ return {"start_cursor": self.next_cursor}
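A minimal illustration of the cursor hand-off this dataclass enables in NotionClient._request; the payload values are made up.

# Sketch of the pagination contract (payload values are made up).
from castor_extractor.knowledge.notion.client.pagination import PaginatedResponse

payload = {"results": [{"id": "abc"}], "next_cursor": "cursor-123", "has_more": True}
page = PaginatedResponse(**payload)

# start_cursor is what _request() passes back as params for the next call.
assert page.start_cursor == {"start_cursor": "cursor-123"}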
castor_extractor/knowledge/notion/extract.py ADDED
@@ -0,0 +1,59 @@
+ import logging
+ from typing import Iterable, Iterator, Tuple, Union
+
+ from ...utils import (
+ OUTPUT_DIR,
+ current_timestamp,
+ deep_serialize,
+ from_env,
+ get_output_filename,
+ write_json,
+ write_summary,
+ )
+ from .assets import NotionAsset
+ from .client import NotionClient, NotionCredentials
+
+ logger = logging.getLogger(__name__)
+
+
+ def iterate_all_data(
+ client: NotionClient,
+ ) -> Iterable[Tuple[NotionAsset, Union[list, Iterator, dict]]]:
+ """Iterate over the extracted data from Notion"""
+
+ logger.info("Extracting USERS from API")
+ users = list(deep_serialize(client.users()))
+ yield NotionAsset.USERS, users
+ logger.info(f"Extracted {len(users)} users from API")
+
+ logger.info("Extracting PAGES from API")
+ pages = list(deep_serialize(client.pages()))
+ yield NotionAsset.PAGES, pages
+ logger.info(f"Extracted {len(pages)} pages from API")
+
+ logger.info("Extracting DATABASES from API")
+ databases = list(deep_serialize(client.databases()))
+ yield NotionAsset.DATABASES, databases
+ logger.info(f"Extracted {len(databases)} databases from API")
+
+
+ def extract_all(**kwargs) -> None:
+ """
+ Extract data from Notion API
+ Store the output files locally under the given output_directory
+ """
+
+ output_directory = kwargs.get("output") or from_env(OUTPUT_DIR)
+
+ credentials = NotionCredentials(**kwargs)
+ client = NotionClient(credentials=credentials)
+
+ ts = current_timestamp()
+
+ for key, data in iterate_all_data(client):
+ filename = get_output_filename(key.name.lower(), output_directory, ts)
+ write_json(filename, data)
+
+ es = current_timestamp()
+
+ write_summary(output_directory, ts, duration_second=es - ts)
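A hedged sketch of calling the new extractor programmatically; the token and output directory are illustrative, and omitting output falls back to the OUTPUT_DIR environment lookup shown above.

# Sketch: programmatic Notion extraction (token and output path are illustrative).
from castor_extractor.knowledge import notion

# Writes users/pages/databases JSON files plus a summary under the output directory.
notion.extract_all(token="secret-notion-token", output="/tmp/castor_notion")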
castor_extractor/quality/__init__.py File without changes (new empty file)
castor_extractor/quality/soda/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .assets import SodaAsset
+ from .client import SodaClient, SodaCredentials
castor_extractor/quality/soda/assets.py ADDED
@@ -0,0 +1,8 @@
+ from ...types import ExternalAsset
+
+
+ class SodaAsset(ExternalAsset):
+ """Soda assets"""
+
+ CHECK_RESULTS = "check_results"
+ DATASETS = "datasets"
castor_extractor/quality/soda/client/__init__.py ADDED
@@ -0,0 +1 @@
+ from .client import SodaClient, SodaCredentials
castor_extractor/quality/soda/client/client.py ADDED
@@ -0,0 +1,99 @@
+ from time import sleep
+ from typing import Any, Iterator, Optional
+
+ import requests
+
+ from ....utils import format_date, yesterday
+ from ....utils.client.api import DEFAULT_TIMEOUT_S, HttpMethod
+ from ..assets import SodaAsset
+ from .credentials import SodaCredentials
+ from .endpoints import SodaEndpointFactory
+
+ _REPORTING_PAGE_SIZE = 400
+ _CLOUD_PAGE_SIZE = 100
+ _REQUESTS_PER_MINUTE = 10
+ _SECONDS_PER_MINUTE = 60
+ _RATE_LIMIT_S = (_SECONDS_PER_MINUTE // _REQUESTS_PER_MINUTE) + 1
+
+
+ class SodaClient:
+ def __init__(
+ self,
+ credentials: SodaCredentials,
+ ):
+ self._timeout = DEFAULT_TIMEOUT_S
+ self._reporting_headers = credentials.reporting_headers
+ self._auth = (credentials.api_key, credentials.secret)
+
+ def _call(
+ self,
+ url: str,
+ headers: dict,
+ method: HttpMethod = "GET",
+ *,
+ params: Optional[dict] = None,
+ data: Optional[dict] = None,
+ auth: Optional[tuple] = None,
+ ) -> Any:
+ response = requests.request(
+ method,
+ url,
+ headers=headers,
+ params=params,
+ json=data,
+ timeout=self._timeout,
+ auth=auth,
+ )
+ response.raise_for_status()
+
+ return response.json()
+
+ def _get_results_paginated(self, url: str, additional: dict) -> Iterator:
+ page_number = 1
+ next_page = True
+ while next_page:
+ json_data = {
+ **{"page": page_number, "size": _REPORTING_PAGE_SIZE},
+ **additional,
+ }
+ _check_results_page = self._call(
+ url=url,
+ method="POST",
+ data=json_data,
+ headers=self._reporting_headers,
+ )
+ yield from _check_results_page["data"]
+
+ next_page = len(_check_results_page["data"]) == _REPORTING_PAGE_SIZE
+ page_number += 1
+
+ def _datasets(self) -> Iterator[dict]:
+ url = SodaEndpointFactory.datasets()
+ next_page = True
+ page_number = 0
+ while next_page:
+ data = self._call(
+ url=url,
+ method="GET",
+ headers={},
+ params={"size": _CLOUD_PAGE_SIZE, "page": page_number},
+ auth=self._auth,
+ )
+ yield from data["content"]
+ next_page = not data["last"]
+ page_number += 1
+ sleep(_RATE_LIMIT_S)
+
+ def _check_results(self) -> Iterator:
+ url = SodaEndpointFactory.check_results()
+ _date = format_date(timestamp=yesterday())
+ return self._get_results_paginated(
+ url, additional={"from_datetime": _date}
+ )
+
+ def fetch(self, asset: SodaAsset) -> Iterator[dict]:
+ if asset == SodaAsset.DATASETS:
+ yield from self._datasets()
+ if asset == SodaAsset.CHECK_RESULTS:
+ yield from self._check_results()
+ raise ValueError(f"The asset {asset}, is not supported")
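A hedged usage sketch of the Soda client added above; the credentials are illustrative. Note that fetch() as written falls through to the final raise once the requested asset has been fully yielded, so the sketch stops on that exception.

# Hedged usage sketch of SodaClient (illustrative credentials; performs real API calls).
from castor_extractor.quality.soda import SodaAsset, SodaClient, SodaCredentials

client = SodaClient(SodaCredentials(api_key="key-id", secret="key-secret"))

try:
    for dataset in client.fetch(SodaAsset.DATASETS):
        print(dataset)
except ValueError:
    # fetch() reaches its final raise after the requested asset is exhausted.
    pass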
castor_extractor/quality/soda/client/credentials.py ADDED
@@ -0,0 +1,28 @@
+ from typing import Dict
+
+ from pydantic import Field
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+ from requests.auth import HTTPBasicAuth
+
+ SODA_ENV_PREFIX = "CASTOR_SODA_"
+
+
+ class SodaCredentials(BaseSettings):
+ """Class to handle Soda rest API permissions"""
+
+ model_config = SettingsConfigDict(
+ env_prefix=SODA_ENV_PREFIX,
+ extra="ignore",
+ populate_by_name=True,
+ )
+
+ api_key: str = Field(repr=False)
+ secret: str = Field(repr=False)
+
+ @property
+ def reporting_headers(self) -> Dict[str, str]:
+ return {
+ "Content-Type": "application/json",
+ "API_KEY_ID": self.api_key,
+ "API_KEY_SECRET": self.secret,
+ }
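Like the Notion credentials, SodaCredentials can resolve its two secrets from the environment using the CASTOR_SODA_ prefix defined above; a small sketch with made-up values.

# Sketch: Soda credentials from the environment (values are made up).
import os

from castor_extractor.quality.soda import SodaCredentials

os.environ["CASTOR_SODA_API_KEY"] = "soda-key-id"
os.environ["CASTOR_SODA_SECRET"] = "soda-key-secret"

credentials = SodaCredentials()
print(credentials.reporting_headers["API_KEY_ID"])  # headers used for the reporting API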
castor_extractor/quality/soda/client/endpoints.py ADDED
@@ -0,0 +1,13 @@
+ class SodaEndpointFactory:
+ """Wrapper class around all endpoints we're using"""
+
+ CLOUD_API = "https://cloud.soda.io/api/v1"
+ REPORTING_API = "https://reporting.cloud.soda.io/v1"
+
+ @classmethod
+ def datasets(cls) -> str:
+ return f"{cls.CLOUD_API}/datasets"
+
+ @classmethod
+ def check_results(cls) -> str:
+ return f"{cls.REPORTING_API}/quality/check_results"
castor_extractor/types.py CHANGED
@@ -50,6 +50,4 @@ class ExternalAsset(Enum):
  """
  Returns the assets that must always be provided.
  """
- return {
- item for item in cls if item not in cls.optional # type: ignore
- }
+ return {item for item in cls if item not in cls.optional} # type: ignore