PyPI - castor-extractor - Versions diffs - 0.21.9__py3-none-any.whl → 0.22.0__py3-none-any.whl - Mend

castor-extractor 0.21.9__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of castor-extractor might be problematic. Click here for more details.

Files changed (128) hide show

CHANGELOG.md CHANGED Viewed

@@ -1,6 +1,10 @@
 # Changelog
+## 0.22.0 - 2024-12-04
+* Stop supporting python3.8
 ## 0.21.9 - 2024-12-04
 * Tableau: fix handling of timeout retry

castor_extractor/commands/__init__.py CHANGED Viewed

@@ -1,3 +0,0 @@
-from castor_extractor.utils import deprecate_python  # type: ignore
-deprecate_python(min_version_supported=(3, 9))

castor_extractor/commands/file_check.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import logging
 import os
 from argparse import ArgumentParser
-from typing import Set
 from castor_extractor import file_checker  # type: ignore
 from castor_extractor.utils import (  # type: ignore
@@ -15,7 +14,7 @@ logger = logging.getLogger(__name__)
 WarehouseTemplate = file_checker.GenericWarehouseFileTemplate
-Ids = Set[str]
+Ids = set[str]
 _ID_KEY = "id"

castor_extractor/file_checker/column.py CHANGED Viewed

@@ -1,11 +1,11 @@
-from typing import Callable, Dict, Optional, Set
+from typing import Callable, Optional
 from dateutil.parser import parse
 from ..utils import string_to_tuple
 from .enums import DataType, Issue
-_CONVERTERS: Dict[DataType, Callable] = {
+_CONVERTERS: dict[DataType, Callable] = {
     DataType.DATETIME: parse,
     DataType.FLOAT: float,
     DataType.INTEGER: int,
@@ -29,13 +29,13 @@ class ColumnChecker:
         data_type: DataType = DataType.STRING,
         is_mandatory: bool = True,
         is_unique: bool = False,
-        foreign: Optional[Set[str]] = None,
-        enum_values: Optional[Set[str]] = None,
+        foreign: Optional[set[str]] = None,
+        enum_values: Optional[set[str]] = None,
     ):
         self.data_type = data_type
         self.is_mandatory = is_mandatory
         self.is_unique = is_unique
-        self.occurrences: Set[str] = set()
+        self.occurrences: set[str] = set()
         self.foreign = foreign
         self.enum_values = enum_values

castor_extractor/file_checker/file.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import logging
-from typing import Dict, Iterable, Iterator, Set
+from collections.abc import Iterable, Iterator
 from .column import ColumnChecker
 from .enums import Issue
@@ -8,8 +8,8 @@ logger = logging.getLogger(__name__)
 _SEPARATOR = f"{30 * '-'}\n"
-FileTemplate = Dict[str, ColumnChecker]  # column_name, column_checker
-IssueCounter = Dict[Issue, int]  # occurrences per type of issue
+FileTemplate = dict[str, ColumnChecker]  # column_name, column_checker
+IssueCounter = dict[Issue, int]  # occurrences per type of issue
 class FileCheckerResults:
@@ -22,7 +22,7 @@ class FileCheckerResults:
         self.total_rows: int = 0
         self.valid_rows: int = 0
         self.counter: IssueCounter = {issue: 0 for issue in Issue}
-        self.indices: Set[int] = set()
+        self.indices: set[int] = set()
     def summary(self) -> str:
         """
@@ -67,7 +67,7 @@ class FileCheckerRun:
     def __init__(
         self,
-        content: Iterable[Dict],
+        content: Iterable[dict],
         template: FileTemplate,
         file_name: str,
         verbose: bool = False,
@@ -128,7 +128,7 @@ class FileCheckerRun:
             header += f"{str(k):<20} {str(v):<100}\n"
         self.logger.info(header + _SEPARATOR + issue_log + _SEPARATOR)
-    def occurrences(self, name: str) -> Set[str]:
+    def occurrences(self, name: str) -> set[str]:
         """
         Return values of the given column, provided:
         - the column exists in the template
@@ -158,7 +158,7 @@ class FileCheckerRun:
         for _ in self.valid_rows():
             pass
-    def valid_rows(self) -> Iterator[Dict]:
+    def valid_rows(self) -> Iterator[dict]:
         """
         Reads the file content and yields only valid rows.
         - Invalid rows are ignored

castor_extractor/file_checker/file_test.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import csv
 import os
-from typing import Dict, Iterator
+from collections.abc import Iterator
 from .column import ColumnChecker
 from .enums import DataType, Issue
@@ -10,7 +10,7 @@ _TEST_FILE = "file_test_users.csv"
 _TEST_FILE_VALID = "file_test_users_valid.csv"
-def _content(path: str) -> Iterator[Dict]:
+def _content(path: str) -> Iterator[dict]:
     absolute_path = os.path.join(os.path.dirname(__file__), path)
     with open(absolute_path) as csvfile:
         yield from csv.DictReader(csvfile)

castor_extractor/file_checker/templates/generic_warehouse.py CHANGED Viewed

@@ -1,5 +1,3 @@
-from typing import Set
 from ..column import ColumnChecker
 from ..constants import TABLE_TYPES
 from ..enums import DataType
@@ -23,7 +21,7 @@ class GenericWarehouseFileTemplate:
         }
     @staticmethod
-    def schema(database_ids: Set[str]) -> FileTemplate:
+    def schema(database_ids: set[str]) -> FileTemplate:
         return {
             "id": ColumnChecker(is_unique=True),
             "database_id": ColumnChecker(foreign=database_ids),
@@ -33,7 +31,7 @@ class GenericWarehouseFileTemplate:
         }
     @staticmethod
-    def table(schema_ids: Set[str]) -> FileTemplate:
+    def table(schema_ids: set[str]) -> FileTemplate:
         return {
             "id": ColumnChecker(is_unique=True),
             "schema_id": ColumnChecker(foreign=schema_ids),
@@ -44,7 +42,7 @@ class GenericWarehouseFileTemplate:
         }
     @staticmethod
-    def column(table_ids: Set[str]) -> FileTemplate:
+    def column(table_ids: set[str]) -> FileTemplate:
         return {
             "id": ColumnChecker(is_unique=True),
             "table_id": ColumnChecker(foreign=table_ids),
@@ -58,7 +56,7 @@ class GenericWarehouseFileTemplate:
         }
     @staticmethod
-    def query(database_ids: Set[str], user_ids: Set[str]) -> FileTemplate:
+    def query(database_ids: set[str], user_ids: set[str]) -> FileTemplate:
         return {
             "database_id": ColumnChecker(foreign=database_ids),
             "query_text": ColumnChecker(),

castor_extractor/knowledge/confluence/client/client.py CHANGED Viewed

@@ -1,6 +1,7 @@
+from collections.abc import Iterator
 from functools import partial
 from http import HTTPStatus
-from typing import Iterator, Optional
+from typing import Optional
 from ....utils import (
     APIClient,

castor_extractor/knowledge/confluence/extract.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import logging
-from typing import Iterable, Iterator, Tuple, Union
+from collections.abc import Iterable, Iterator
+from typing import Union
 from ...utils import (
     OUTPUT_DIR,
@@ -18,7 +19,7 @@ logger = logging.getLogger(__name__)
 def iterate_all_data(
     client: ConfluenceClient,
-) -> Iterable[Tuple[ConfluenceAsset, Union[list, Iterator, dict]]]:
+) -> Iterable[tuple[ConfluenceAsset, Union[list, Iterator, dict]]]:
     """Iterate over the extracted data from Confluence"""
     logger.info("Extracting USERS from API")

castor_extractor/knowledge/notion/client/client.py CHANGED Viewed

@@ -1,6 +1,7 @@
+from collections.abc import Iterator
 from functools import partial
 from http import HTTPStatus
-from typing import Dict, Iterator, Optional
+from typing import Optional
 from ....utils import APIClient, BearerAuth, RequestSafeMode, fetch_all_pages
 from ..assets import NotionAsset
@@ -23,7 +24,7 @@ NOTION_BASE_HEADERS = {
 NOTION_DEFAULT_TIMEOUT_S = 180
-def _search_filter(asset: str) -> Dict[str, Dict[str, str]]:
+def _search_filter(asset: str) -> dict[str, dict[str, str]]:
     return {"filter": {"value": asset, "property": "object"}}

castor_extractor/knowledge/notion/extract.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import logging
-from typing import Iterable, Iterator, Tuple, Union
+from collections.abc import Iterable, Iterator
+from typing import Union
 from ...utils import (
     OUTPUT_DIR,
@@ -18,7 +19,7 @@ logger = logging.getLogger(__name__)
 def iterate_all_data(
     client: NotionClient,
-) -> Iterable[Tuple[NotionAsset, Union[list, Iterator, dict]]]:
+) -> Iterable[tuple[NotionAsset, Union[list, Iterator, dict]]]:
     """Iterate over the extracted data from Notion"""
     logger.info("Extracting USERS from API")

castor_extractor/quality/soda/client/client.py CHANGED Viewed

@@ -1,5 +1,6 @@
+from collections.abc import Iterator
 from functools import partial
-from typing import Callable, Iterator
+from typing import Callable
 from ....utils import (
     APIClient,

castor_extractor/quality/soda/client/pagination.py CHANGED Viewed

@@ -1,12 +1,10 @@
-from typing import List
 from ....utils import PaginationModel
 _CLOUD_FIRST_PAGE = 0
 class SodaCloudPagination(PaginationModel):
-    content: List[dict]
+    content: list[dict]
     last: bool
     def is_last(self) -> bool:

castor_extractor/types.py CHANGED Viewed

@@ -1,6 +1,6 @@
 from abc import ABC
 from enum import Enum
-from typing import Literal, Set, TypedDict
+from typing import Literal, TypedDict
 class CsvOptions(TypedDict):
@@ -34,7 +34,7 @@ class ExternalAsset(Enum):
     __metaclass__ = ABC
     @classproperty
-    def optional(cls) -> Set["ExternalAsset"]:
+    def optional(cls) -> set["ExternalAsset"]:
         """
         Returns the assets that are not necessarily extracted/pushed.
         Example:
@@ -46,7 +46,7 @@ class ExternalAsset(Enum):
         return set()
     @classproperty
-    def mandatory(cls) -> Set["ExternalAsset"]:
+    def mandatory(cls) -> set["ExternalAsset"]:
         """
         Returns the assets that must always be provided.
         """

castor_extractor/uploader/env.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import os
-from typing import Optional, Tuple
+from typing import Optional
 from .constant import (
     DEFAULT_RETRY,
@@ -27,7 +27,7 @@ def _parse_int(value: Optional[str], default: int) -> int:
         return default
-def get_blob_env() -> Tuple[float, int]:
+def get_blob_env() -> tuple[float, int]:
     """
     Retrieve timeout and retries values. It look for environment variables
     first and return default value otherwise

castor_extractor/uploader/upload.py CHANGED Viewed

@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 import logging
 import ntpath
+from collections.abc import Iterable
 from datetime import datetime
-from typing import Dict, Iterable, Optional, Tuple
+from typing import Optional
 from uuid import UUID
 import requests
@@ -32,7 +33,7 @@ def _path_and_url(
     source_id: UUID,
     file_type: FileType,
     file_path: str,
-) -> Tuple[str, str]:
+) -> tuple[str, str]:
     now = datetime.utcnow()
     timestamp = int(now.timestamp())
     filename = ntpath.basename(file_path)
@@ -48,7 +49,7 @@ def _path_and_url(
     return path, url
-def _headers(token: str) -> Dict:
+def _headers(token: str) -> dict:
     return {
         "Authorization": f"Token {token}",
         "Accept": "text/csv, application/json",

castor_extractor/uploader/utils.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 import os
-from typing import Iterator
+from collections.abc import Iterator
 logger = logging.getLogger(__name__)

castor_extractor/utils/client/abstract.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import Iterator, Optional, cast
+from collections.abc import Iterator
+from typing import Optional, cast
 from sqlalchemy import text
 from sqlalchemy.engine import Connection, ResultProxy, create_engine

castor_extractor/utils/client/api/auth.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 from abc import ABC, abstractmethod
-from typing import Dict, Optional, Union
+from typing import Optional, Union
 from requests.auth import AuthBase, HTTPBasicAuth
@@ -33,7 +33,7 @@ class CustomAuth(AuthBase, ABC):
         pass
     @abstractmethod
-    def _authentication_header(self) -> Dict[str, str]:
+    def _authentication_header(self) -> dict[str, str]:
         pass
     def __call__(self, r):

castor_extractor/utils/client/api/auth_test.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, Optional
+from typing import Optional
 from .auth import BasicAuth, BearerAuth, CustomAuth
@@ -9,7 +9,7 @@ class _MockRequest:
 class _CustomAuth(CustomAuth):
-    def _authentication_header(self) -> Dict[str, str]:
+    def _authentication_header(self) -> dict[str, str]:
         return {"custom-token": "token"}

castor_extractor/utils/client/api/client.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 from http import HTTPStatus
-from typing import Callable, Dict, Literal, Optional, Tuple
+from typing import Callable, Literal, Optional
 import requests
 from requests import Response
@@ -12,7 +12,7 @@ from .utils import build_url
 logger = logging.getLogger(__name__)
-Headers = Optional[Dict[str, str]]
+Headers = Optional[dict[str, str]]
 # https://requests.readthedocs.io/en/latest/api/#requests.request
 HttpMethod = Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"]
@@ -27,7 +27,7 @@ def _generate_payloads(
     params: Optional[dict],
     data: Optional[dict],
     pagination_params: Optional[dict],
-) -> Tuple[Optional[dict], Optional[dict]]:
+) -> tuple[Optional[dict], Optional[dict]]:
     _pagination_params = pagination_params or {}
     if method == "GET":

castor_extractor/utils/client/api/pagination.py CHANGED Viewed

@@ -1,9 +1,10 @@
 import logging
 from abc import abstractmethod
+from collections.abc import Iterator
 from enum import Enum
 from functools import partial
 from time import sleep
-from typing import Callable, Iterator, Optional, Type, Union
+from typing import Callable, Optional, Union
 from pydantic import BaseModel
@@ -56,7 +57,7 @@ class PaginationModel(BaseModel):
 def fetch_all_pages(
     request: Callable,
-    pagination_model: Type[PaginationModel],
+    pagination_model: type[PaginationModel],
     rate_limit: Optional[int] = None,
 ) -> Iterator:
     """

castor_extractor/utils/client/api/safe_request.py CHANGED Viewed

@@ -1,11 +1,11 @@
 import logging
-from typing import Any, Callable, List, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Union
 from requests import HTTPError, Response
 logger = logging.getLogger(__name__)
-ResponseJson = Union[dict, List[dict]]
+ResponseJson = Union[dict, list[dict]]
 class RequestSafeMode:
@@ -21,11 +21,11 @@ class RequestSafeMode:
     def __init__(
         self,
         max_errors: Union[int, float] = 0,
-        status_codes: Tuple[int, ...] = (),
+        status_codes: tuple[int, ...] = (),
     ):
         self.max_errors = max_errors
-        self.status_codes: List[int] = list(status_codes)
-        self.status_codes_caught: List[int] = []
+        self.status_codes: list[int] = list(status_codes)
+        self.status_codes_caught: list[int] = []
     def catch_response(self, exception: HTTPError, status_code: int):
         if int(status_code) not in self.status_codes:

castor_extractor/utils/collection.py CHANGED Viewed

@@ -1,11 +1,7 @@
 from collections import defaultdict
+from collections.abc import Iterable, Sequence
 from typing import (
     Any,
-    Dict,
-    Iterable,
-    List,
-    Sequence,
-    Set,
     TypeVar,
 )
@@ -15,9 +11,9 @@ from .type import Getter
 T = TypeVar("T")
-def group_by(identifier: Getter, elements: Sequence) -> Dict[Any, List]:
+def group_by(identifier: Getter, elements: Sequence) -> dict[Any, list]:
     """Groups the elements by the given key"""
-    groups: Dict[Any, List] = defaultdict(list)
+    groups: dict[Any, list] = defaultdict(list)
     for element in elements:
         key = getproperty(element, identifier)
         groups[key].append(element)
@@ -25,7 +21,7 @@ def group_by(identifier: Getter, elements: Sequence) -> Dict[Any, List]:
     return groups
-def mapping_from_rows(rows: List[Dict], key: Any, value: Any) -> Dict:
+def mapping_from_rows(rows: list[dict], key: Any, value: Any) -> dict:
     """
     Create a dictionary mapping from a list of dictionaries using specified keys for mapping.
@@ -68,13 +64,13 @@ def empty_iterator():
 def deduplicate(
     identifier: Getter,
     elements: Iterable[T],
-) -> List[T]:
+) -> list[T]:
     """
     Remove duplicates in the given elements, using the specified identifier
     Only the first occurrence is kept.
     """
-    deduplicated: List[T] = []
-    processed: Set[Any] = set()
+    deduplicated: list[T] = []
+    processed: set[Any] = set()
     for element in elements:
         key = getproperty(element, identifier)

castor_extractor/utils/dbt/client.py CHANGED Viewed

@@ -3,7 +3,7 @@ import logging
 from dataclasses import dataclass
 from datetime import datetime
 from enum import Enum
-from typing import Literal, Optional, Set, Tuple
+from typing import Literal, Optional
 import requests
 from dateutil.parser import parse
@@ -92,7 +92,7 @@ class DbtClient:
         result = self._call(url=self._account_url)
         return result[0]["id"]
-    def list_job_identifiers(self) -> Set[int]:
+    def list_job_identifiers(self) -> set[int]:
         """
         Return the IDs of all non-deleted jobs for this account
         https://docs.getdbt.com/dbt-cloud/api-v2-legacy#tag/Jobs/operation/listJobsForAccount
@@ -104,7 +104,7 @@ class DbtClient:
     def last_run(
         self,
         job_id: Optional[int] = None,
-        finished_at_range: Optional[Tuple[datetime, datetime]] = None,
+        finished_at_range: Optional[tuple[datetime, datetime]] = None,
     ) -> Optional[DbtRun]:
         """
         Extract the last successful run id, optionally filtered on a given datetime range

castor_extractor/utils/dbt/client_test.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import json
 from datetime import datetime, timedelta
-from typing import Optional, Tuple, Union
+from typing import Optional, Union
 from unittest.mock import MagicMock, patch
 import pytest
@@ -21,7 +21,7 @@ _RECENT_DATE_STR = "2023-10-06 05:09:31.731991+00:00"
 def _assert_called_with(
     mocked_call: MagicMock,
     job_id: Union[int, str],
-    date_range: Optional[Tuple[datetime, datetime]] = None,
+    date_range: Optional[tuple[datetime, datetime]] = None,
 ) -> None:
     url = "https://cloud.getdbt.com/api/v2/accounts/40/runs/"
     params = {

castor_extractor/utils/deprecate.py CHANGED Viewed

@@ -1,10 +1,9 @@
 import logging
 import sys
 import warnings
-from typing import Tuple
-def deprecate_python(min_version_supported: Tuple[int, ...]):
+def deprecate_python(min_version_supported: tuple[int, ...]):
     """raises a warning if python version < min_version_supported"""
     python_version = (

castor_extractor/utils/files.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import glob
 import os
-from typing import List, Optional, Set, Tuple
+from typing import Optional
-def explode(path: str) -> Tuple[str, str, str]:
+def explode(path: str) -> tuple[str, str, str]:
     """
     Split a file path into 3 parts:
     - Head (directory)
@@ -20,9 +20,9 @@ def search_files(
     directory: str,
     *,
     filter_endswith: Optional[str] = None,
-    filter_extensions: Optional[Set[str]] = None,
-    does_not_contain: Optional[Set[str]] = None,
-) -> List[str]:
+    filter_extensions: Optional[set[str]] = None,
+    does_not_contain: Optional[set[str]] = None,
+) -> list[str]:
     """Retrieve files in a directory, matching given criteria"""
     def _does_not_contain(path: str) -> bool:

castor_extractor/utils/formatter.py CHANGED Viewed

@@ -6,9 +6,10 @@ import logging
 import re
 import sys
 from abc import ABC, abstractmethod
+from collections.abc import Iterable, Iterator, Sequence
 from datetime import date, datetime
 from enum import Enum
-from typing import IO, Any, Iterable, Iterator, List, Sequence, Union
+from typing import IO, Any, Union
 from uuid import UUID
 from ..types import CsvOptions
@@ -49,7 +50,7 @@ def _scalar(value: Any) -> ScalarValue:
     return str(value)
-def _row(header: Sequence[str], row: dict) -> List[ScalarValue]:
+def _row(header: Sequence[str], row: dict) -> list[ScalarValue]:
     return [_scalar(row.get(h)) for h in header]
@@ -60,7 +61,7 @@ def remove_unsupported_byte(element: ScalarValue) -> ScalarValue:
     return re.sub("\x00", "", element)
-def to_string_array(arr_json: str) -> List[str]:
+def to_string_array(arr_json: str) -> list[str]:
     """
     Converts a JSON-serialized string array value as a string to a list
     Ex: '["items","count"]' to ["items", "order"]
@@ -100,7 +101,7 @@ def from_csv(buffer: IO[str]) -> Iterator[dict]:
     """convert data as from a CSV string to list of dict"""
     try:
         reader = csv.reader(buffer, **CSV_OPTIONS)
-        header: List[str] = []
+        header: list[str] = []
         for row in reader:
             if not header:
                 header = list(row)

castor_extractor/utils/json_stream_write.py CHANGED Viewed

@@ -21,7 +21,8 @@ SOFTWARE.
 """
 from collections import deque
-from typing import Any, Iterable, Iterator
+from collections.abc import Iterable, Iterator
+from typing import Any
 class Streamable:

castor_extractor/utils/object.py CHANGED Viewed

@@ -1,6 +1,7 @@
+from collections.abc import Iterator
 from datetime import date, datetime
 from enum import Enum
-from typing import Any, Iterator, Union, overload
+from typing import Any, Union, overload
 from uuid import UUID
 from .type import Getter

castor-extractor 0.21.9__py3-none-any.whl → 0.22.0__py3-none-any.whl

Potentially problematic release.

castor-extractor 0.21.9__py3-none-any.whl → 0.22.0__py3-none-any.whl