castor-extractor 0.21.9__py3-none-any.whl → 0.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

Files changed (128)
  1. CHANGELOG.md +4 -0
  2. castor_extractor/commands/__init__.py +0 -3
  3. castor_extractor/commands/file_check.py +1 -2
  4. castor_extractor/file_checker/column.py +5 -5
  5. castor_extractor/file_checker/file.py +7 -7
  6. castor_extractor/file_checker/file_test.py +2 -2
  7. castor_extractor/file_checker/templates/generic_warehouse.py +4 -6
  8. castor_extractor/knowledge/confluence/client/client.py +2 -1
  9. castor_extractor/knowledge/confluence/extract.py +3 -2
  10. castor_extractor/knowledge/notion/client/client.py +3 -2
  11. castor_extractor/knowledge/notion/extract.py +3 -2
  12. castor_extractor/quality/soda/client/client.py +2 -1
  13. castor_extractor/quality/soda/client/pagination.py +1 -3
  14. castor_extractor/types.py +3 -3
  15. castor_extractor/uploader/env.py +2 -2
  16. castor_extractor/uploader/upload.py +4 -3
  17. castor_extractor/uploader/utils.py +1 -1
  18. castor_extractor/utils/client/abstract.py +2 -1
  19. castor_extractor/utils/client/api/auth.py +2 -2
  20. castor_extractor/utils/client/api/auth_test.py +2 -2
  21. castor_extractor/utils/client/api/client.py +3 -3
  22. castor_extractor/utils/client/api/pagination.py +3 -2
  23. castor_extractor/utils/client/api/safe_request.py +5 -5
  24. castor_extractor/utils/collection.py +7 -11
  25. castor_extractor/utils/dbt/client.py +3 -3
  26. castor_extractor/utils/dbt/client_test.py +2 -2
  27. castor_extractor/utils/deprecate.py +1 -2
  28. castor_extractor/utils/files.py +5 -5
  29. castor_extractor/utils/formatter.py +5 -4
  30. castor_extractor/utils/json_stream_write.py +2 -1
  31. castor_extractor/utils/object.py +2 -1
  32. castor_extractor/utils/pager/pager.py +2 -4
  33. castor_extractor/utils/pager/pager_on_id.py +2 -1
  34. castor_extractor/utils/pager/pager_on_id_test.py +5 -5
  35. castor_extractor/utils/pager/pager_test.py +3 -3
  36. castor_extractor/utils/retry.py +4 -3
  37. castor_extractor/utils/retry_test.py +2 -3
  38. castor_extractor/utils/safe.py +3 -3
  39. castor_extractor/utils/salesforce/client.py +2 -1
  40. castor_extractor/utils/salesforce/credentials.py +1 -3
  41. castor_extractor/utils/store.py +2 -1
  42. castor_extractor/utils/string.py +2 -2
  43. castor_extractor/utils/string_test.py +1 -3
  44. castor_extractor/utils/type.py +3 -2
  45. castor_extractor/utils/validation.py +4 -4
  46. castor_extractor/utils/write.py +2 -2
  47. castor_extractor/visualization/domo/client/client.py +8 -7
  48. castor_extractor/visualization/domo/client/credentials.py +2 -2
  49. castor_extractor/visualization/domo/client/endpoints.py +2 -2
  50. castor_extractor/visualization/domo/extract.py +3 -2
  51. castor_extractor/visualization/looker/api/client.py +17 -16
  52. castor_extractor/visualization/looker/api/utils.py +2 -2
  53. castor_extractor/visualization/looker/assets.py +1 -3
  54. castor_extractor/visualization/looker/extract.py +4 -3
  55. castor_extractor/visualization/looker/fields.py +3 -3
  56. castor_extractor/visualization/looker/multithreading.py +3 -3
  57. castor_extractor/visualization/metabase/assets.py +1 -3
  58. castor_extractor/visualization/metabase/client/api/client.py +8 -7
  59. castor_extractor/visualization/metabase/extract.py +3 -2
  60. castor_extractor/visualization/metabase/types.py +1 -3
  61. castor_extractor/visualization/mode/client/client.py +6 -6
  62. castor_extractor/visualization/mode/extract.py +2 -2
  63. castor_extractor/visualization/powerbi/assets.py +1 -3
  64. castor_extractor/visualization/powerbi/client/client.py +12 -11
  65. castor_extractor/visualization/powerbi/client/credentials.py +3 -3
  66. castor_extractor/visualization/powerbi/client/endpoints.py +2 -2
  67. castor_extractor/visualization/powerbi/extract.py +3 -2
  68. castor_extractor/visualization/qlik/assets.py +1 -3
  69. castor_extractor/visualization/qlik/client/constants.py +1 -3
  70. castor_extractor/visualization/qlik/client/engine/error.py +1 -3
  71. castor_extractor/visualization/qlik/client/master.py +3 -3
  72. castor_extractor/visualization/qlik/client/rest.py +12 -12
  73. castor_extractor/visualization/qlik/extract.py +4 -3
  74. castor_extractor/visualization/salesforce_reporting/client/rest.py +3 -2
  75. castor_extractor/visualization/salesforce_reporting/client/soql.py +1 -3
  76. castor_extractor/visualization/salesforce_reporting/extract.py +3 -2
  77. castor_extractor/visualization/sigma/client/client.py +9 -8
  78. castor_extractor/visualization/sigma/client/credentials.py +1 -3
  79. castor_extractor/visualization/sigma/extract.py +3 -2
  80. castor_extractor/visualization/tableau/assets.py +1 -2
  81. castor_extractor/visualization/tableau/client/client.py +1 -2
  82. castor_extractor/visualization/tableau/client/client_utils.py +3 -2
  83. castor_extractor/visualization/tableau/client/credentials.py +3 -3
  84. castor_extractor/visualization/tableau/client/safe_mode.py +1 -2
  85. castor_extractor/visualization/tableau/extract.py +2 -2
  86. castor_extractor/visualization/tableau/gql_fields.py +3 -3
  87. castor_extractor/visualization/tableau/tsc_fields.py +1 -2
  88. castor_extractor/visualization/tableau/types.py +3 -3
  89. castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +3 -2
  90. castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +3 -3
  91. castor_extractor/visualization/tableau_revamp/client/client_tsc.py +3 -2
  92. castor_extractor/visualization/tableau_revamp/client/gql_queries.py +1 -3
  93. castor_extractor/visualization/tableau_revamp/client/rest_fields.py +1 -3
  94. castor_extractor/visualization/tableau_revamp/extract.py +2 -2
  95. castor_extractor/visualization/thoughtspot/client/client.py +3 -2
  96. castor_extractor/visualization/thoughtspot/client/utils.py +1 -1
  97. castor_extractor/visualization/thoughtspot/extract.py +3 -2
  98. castor_extractor/warehouse/abstract/asset.py +4 -5
  99. castor_extractor/warehouse/abstract/extract.py +4 -3
  100. castor_extractor/warehouse/abstract/query.py +4 -4
  101. castor_extractor/warehouse/bigquery/client.py +8 -8
  102. castor_extractor/warehouse/bigquery/extract.py +1 -1
  103. castor_extractor/warehouse/bigquery/query.py +2 -2
  104. castor_extractor/warehouse/bigquery/types.py +2 -4
  105. castor_extractor/warehouse/databricks/api_client.py +15 -14
  106. castor_extractor/warehouse/databricks/client.py +16 -16
  107. castor_extractor/warehouse/databricks/extract.py +4 -4
  108. castor_extractor/warehouse/databricks/format.py +12 -12
  109. castor_extractor/warehouse/databricks/lineage.py +11 -11
  110. castor_extractor/warehouse/databricks/pagination.py +2 -2
  111. castor_extractor/warehouse/databricks/types.py +4 -4
  112. castor_extractor/warehouse/databricks/utils.py +5 -4
  113. castor_extractor/warehouse/mysql/query.py +2 -2
  114. castor_extractor/warehouse/postgres/query.py +2 -2
  115. castor_extractor/warehouse/redshift/client.py +1 -1
  116. castor_extractor/warehouse/redshift/query.py +2 -2
  117. castor_extractor/warehouse/salesforce/client.py +8 -8
  118. castor_extractor/warehouse/salesforce/extract.py +3 -4
  119. castor_extractor/warehouse/salesforce/format.py +8 -7
  120. castor_extractor/warehouse/salesforce/format_test.py +2 -4
  121. castor_extractor/warehouse/snowflake/query.py +5 -5
  122. castor_extractor/warehouse/sqlserver/client.py +1 -1
  123. castor_extractor/warehouse/sqlserver/query.py +2 -2
  124. {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/METADATA +7 -6
  125. {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/RECORD +128 -128
  126. {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/LICENCE +0 -0
  127. {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/WHEEL +0 -0
  128. {castor_extractor-0.21.9.dist-info → castor_extractor-0.22.0.dist-info}/entry_points.txt +0 -0
@@ -1,4 +1,5 @@
1
- from typing import Dict, Iterator, Optional
1
+ from collections.abc import Iterator
2
+ from typing import Optional
2
3
 
3
4
  from ....utils import SerializedAsset
4
5
  from ..assets import TableauAsset
@@ -69,6 +70,6 @@ def query_scroll(
69
70
  break
70
71
 
71
72
 
72
- def extract_asset(asset: Dict, asset_type: TableauAsset) -> Dict:
73
+ def extract_asset(asset: dict, asset_type: TableauAsset) -> dict:
73
74
  """Agnostic function extracting dedicated attributes with define asset"""
74
75
  return {key: getattr(asset, key) for key in TSC_FIELDS[asset_type]}
@@ -1,5 +1,5 @@
1
1
  from enum import Enum
2
- from typing import Dict, Optional
2
+ from typing import Optional
3
3
 
4
4
  from ....utils import from_env
5
5
 
@@ -20,7 +20,7 @@ class CredentialsKey(Enum):
20
20
  TABLEAU_SERVER_URL = "server_url"
21
21
 
22
22
 
23
- CREDENTIALS_ENV: Dict[CredentialsKey, str] = {
23
+ CREDENTIALS_ENV: dict[CredentialsKey, str] = {
24
24
  CredentialsKey.TABLEAU_USER: "CASTOR_TABLEAU_USER",
25
25
  CredentialsKey.TABLEAU_PASSWORD: "CASTOR_TABLEAU_PASSWORD",
26
26
  CredentialsKey.TABLEAU_TOKEN_NAME: "CASTOR_TABLEAU_TOKEN_NAME",
@@ -89,7 +89,7 @@ class CredentialsApi:
89
89
  CredentialsKey.TABLEAU_TOKEN: token,
90
90
  }
91
91
 
92
- def to_dict(self, hide: bool = False) -> Dict[str, str]:
92
+ def to_dict(self, hide: bool = False) -> dict[str, str]:
93
93
  safe = (
94
94
  CredentialsKey.TABLEAU_USER,
95
95
  CredentialsKey.TABLEAU_SITE_ID,
@@ -1,5 +1,4 @@
1
1
  import logging
2
- from typing import Dict, List
3
2
 
4
3
  import tableauserverclient as TSC # type: ignore
5
4
 
@@ -48,7 +47,7 @@ def safe_mode_fetch_usage(client) -> SerializedAsset:
48
47
  Returns computed usages when page number is not found
49
48
  Log errors if ServerResponseError is return
50
49
  """
51
- list_usages: List[Dict] = []
50
+ list_usages: list[dict] = []
52
51
  page_number: int = 0
53
52
 
54
53
  while True:
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Iterable, Tuple
2
+ from collections.abc import Iterable
3
3
 
4
4
  from ...utils import (
5
5
  OUTPUT_DIR,
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
19
19
 
20
20
  def iterate_all_data(
21
21
  client: Client,
22
- ) -> Iterable[Tuple[TableauAsset, list]]:
22
+ ) -> Iterable[tuple[TableauAsset, list]]:
23
23
  """Iterate over the extracted Data from Tableau"""
24
24
 
25
25
  logger.info("Extracting USER from Tableau API")
@@ -1,6 +1,6 @@
1
1
  # Fields which will be use for Tableau GraphQL API
2
2
  from enum import Enum
3
- from typing import Dict, List, Union
3
+ from typing import Union
4
4
 
5
5
  from .assets import TableauAsset, TableauGraphqlAsset
6
6
 
@@ -189,9 +189,9 @@ class GQLQueryFields(Enum):
189
189
  """
190
190
 
191
191
 
192
- QueryInfo = List[Dict[str, Union[GQLQueryFields, TableauGraphqlAsset]]]
192
+ QueryInfo = list[dict[str, Union[GQLQueryFields, TableauGraphqlAsset]]]
193
193
 
194
- QUERY_FIELDS: Dict[TableauAsset, QueryInfo] = {
194
+ QUERY_FIELDS: dict[TableauAsset, QueryInfo] = {
195
195
  TableauAsset.CUSTOM_SQL_TABLE: [
196
196
  {
197
197
  FIELDS: GQLQueryFields.CUSTOM_SQL_TABLE,
@@ -1,10 +1,9 @@
1
1
  # TSC for TableauServerClient: basic REST API to extracting core objects
2
- from typing import Dict, Set
3
2
 
4
3
  from .assets import TableauAsset
5
4
 
6
5
  # TSC fields extracted per assets
7
- TSC_FIELDS: Dict[TableauAsset, Set[str]] = {
6
+ TSC_FIELDS: dict[TableauAsset, set[str]] = {
8
7
  TableauAsset.PROJECT: {
9
8
  "id",
10
9
  "name",
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, Tuple, Union
1
+ from typing import Union
2
2
 
3
3
  from tableauserverclient import ServerResponseError # type: ignore
4
4
  from typing_extensions import Literal
@@ -6,6 +6,6 @@ from typing_extensions import Literal
6
6
  from .errors import TableauErrorCode
7
7
 
8
8
  PageReturn = Union[
9
- Tuple[List[Dict], Literal[None]],
10
- Tuple[Literal[None], Union[TableauErrorCode, ServerResponseError]],
9
+ tuple[list[dict], Literal[None]],
10
+ tuple[Literal[None], Union[TableauErrorCode, ServerResponseError]],
11
11
  ]
@@ -1,4 +1,5 @@
1
- from typing import Dict, Iterator, Optional
1
+ from collections.abc import Iterator
2
+ from typing import Optional
2
3
 
3
4
  import tableauserverclient as TSC # type: ignore
4
5
 
@@ -10,7 +11,7 @@ from .gql_queries import FIELDS_QUERIES, GQL_QUERIES, QUERY_TEMPLATE
10
11
 
11
12
  # increase the value when extraction is too slow
12
13
  # decrease the value when timeouts arise
13
- _CUSTOM_PAGE_SIZE: Dict[TableauRevampAsset, int] = {
14
+ _CUSTOM_PAGE_SIZE: dict[TableauRevampAsset, int] = {
14
15
  # for some clients, extraction of columns tend to hit the node limit
15
16
  # https://community.tableau.com/s/question/0D54T00000YuK60SAF/metadata-query-nodelimitexceeded-error
16
17
  # the workaround is to reduce pagination
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Dict, Optional
2
+ from typing import Optional
3
3
 
4
4
  import requests
5
5
  import tableauserverclient as TSC # type: ignore
@@ -40,7 +40,7 @@ class TableauClientRestApi:
40
40
  return self._server.http_options["timeout"]
41
41
 
42
42
  @property
43
- def headers(self) -> Dict[str, str]:
43
+ def headers(self) -> dict[str, str]:
44
44
  return {"x-tableau-auth": self._server.auth_token}
45
45
 
46
46
  def _get_site_name(self) -> str:
@@ -52,7 +52,7 @@ class TableauClientRestApi:
52
52
  self,
53
53
  url: str,
54
54
  page_token: Optional[str] = None,
55
- ) -> Dict:
55
+ ) -> dict:
56
56
  if page_token:
57
57
  url += f"?page_token={page_token}"
58
58
 
@@ -1,4 +1,5 @@
1
- from typing import Any, Dict, Iterable, Iterator
1
+ from collections.abc import Iterable, Iterator
2
+ from typing import Any
2
3
 
3
4
  import tableauserverclient as TSC # type: ignore
4
5
 
@@ -30,7 +31,7 @@ class TableauClientTSC:
30
31
  self,
31
32
  data: Iterable,
32
33
  asset: TableauRevampAsset,
33
- ) -> Iterator[Dict]:
34
+ ) -> Iterator[dict]:
34
35
  keys = REST_FIELDS[asset]
35
36
 
36
37
  for row in data:
@@ -1,5 +1,3 @@
1
- from typing import Dict, Tuple
2
-
3
1
  from ..assets import TableauRevampAsset
4
2
 
5
3
  QUERY_TEMPLATE = """
@@ -130,7 +128,7 @@ workbook { id }
130
128
  """
131
129
 
132
130
 
133
- GQL_QUERIES: Dict[TableauRevampAsset, Tuple[str, str]] = {
131
+ GQL_QUERIES: dict[TableauRevampAsset, tuple[str, str]] = {
134
132
  TableauRevampAsset.COLUMN: ("columns", _COLUMNS_QUERY),
135
133
  TableauRevampAsset.DASHBOARD: ("dashboards", _DASHBOARDS_QUERY),
136
134
  TableauRevampAsset.DATASOURCE: ("datasources", _DATASOURCES_QUERY),
@@ -1,9 +1,7 @@
1
- from typing import Dict, Set
2
-
3
1
  from ..assets import TableauRevampAsset
4
2
 
5
3
  # list of fields to pick in REST API or TSC responses
6
- REST_FIELDS: Dict[TableauRevampAsset, Set[str]] = {
4
+ REST_FIELDS: dict[TableauRevampAsset, set[str]] = {
7
5
  TableauRevampAsset.DATASOURCE: {
8
6
  "id",
9
7
  "project_id",
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Iterable, Tuple
2
+ from collections.abc import Iterable
3
3
 
4
4
  from ...utils import (
5
5
  OUTPUT_DIR,
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
18
18
 
19
19
  def iterate_all_data(
20
20
  client: TableauRevampClient,
21
- ) -> Iterable[Tuple[TableauRevampAsset, list]]:
21
+ ) -> Iterable[tuple[TableauRevampAsset, list]]:
22
22
  """Iterate over the extracted Data from Tableau"""
23
23
 
24
24
  for asset in TableauRevampAsset:
@@ -1,4 +1,5 @@
1
- from typing import Dict, Iterator, Optional
1
+ from collections.abc import Iterator
2
+ from typing import Optional
2
3
 
3
4
  import requests
4
5
 
@@ -35,7 +36,7 @@ THOUGHTSPOT_SAFE_MODE = RequestSafeMode()
35
36
 
36
37
 
37
38
  class ThoughtspotBearerAuth(BearerAuth):
38
- def __init__(self, host: str, token_payload: Dict[str, str]):
39
+ def __init__(self, host: str, token_payload: dict[str, str]):
39
40
  auth_endpoint = ThoughtspotEndpointFactory.authentication()
40
41
  self.authentication_url = build_url(host, auth_endpoint)
41
42
  self.token_payload = token_payload
@@ -1,6 +1,6 @@
1
1
  import csv
2
+ from collections.abc import Iterator
2
3
  from io import StringIO
3
- from typing import Iterator
4
4
 
5
5
 
6
6
  def usage_liveboard_reader(usage_liveboard_csv: str) -> Iterator[dict]:
@@ -1,5 +1,6 @@
1
1
  import logging
2
- from typing import Iterable, Iterator, Tuple, Union
2
+ from collections.abc import Iterable, Iterator
3
+ from typing import Union
3
4
 
4
5
  from ...utils import (
5
6
  OUTPUT_DIR,
@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
21
22
 
22
23
  def iterate_all_data(
23
24
  client: ThoughtspotClient,
24
- ) -> Iterable[Tuple[ThoughtspotAsset, Union[list, Iterator, dict]]]:
25
+ ) -> Iterable[tuple[ThoughtspotAsset, Union[list, Iterator, dict]]]:
25
26
  """Iterate over the extracted data from Thoughtspot"""
26
27
 
27
28
  for asset in ThoughtspotAsset:
@@ -1,5 +1,4 @@
1
1
  from enum import Enum
2
- from typing import Dict, List, Set, Tuple
3
2
 
4
3
  from ...types import ExternalAsset, classproperty
5
4
 
@@ -26,7 +25,7 @@ class WarehouseAsset(ExternalAsset):
26
25
  VIEW_DDL = "view_ddl"
27
26
 
28
27
  @classproperty
29
- def optional(cls) -> Set["WarehouseAsset"]:
28
+ def optional(cls) -> set["WarehouseAsset"]:
30
29
  return {
31
30
  WarehouseAsset.ADDITIONAL_COLUMN_LINEAGE,
32
31
  WarehouseAsset.ADDITIONAL_TABLE_LINEAGE,
@@ -50,7 +49,7 @@ class WarehouseAssetGroup(Enum):
50
49
 
51
50
 
52
51
  # tuple of supported assets for each group (depends on the technology)
53
- SupportedAssets = Dict[WarehouseAssetGroup, Tuple[WarehouseAsset, ...]]
52
+ SupportedAssets = dict[WarehouseAssetGroup, tuple[WarehouseAsset, ...]]
54
53
 
55
54
  # shared by all technologies
56
55
  CATALOG_ASSETS = (
@@ -80,13 +79,13 @@ NON_EXTRACTABLE_ASSETS = {WarehouseAssetGroup.EXTERNAL_LINEAGE}
80
79
 
81
80
  def extractable_asset_groups(
82
81
  supported_assets: SupportedAssets,
83
- ) -> List[Tuple[WarehouseAsset, ...]]:
82
+ ) -> list[tuple[WarehouseAsset, ...]]:
84
83
  """
85
84
  helper function to differentiate
86
85
  extractable assets vs supported (ingest-able) assets
87
86
  """
88
87
  groups = set(supported_assets).difference(NON_EXTRACTABLE_ASSETS)
89
- extractable: Set[Tuple[WarehouseAsset, ...]] = {
88
+ extractable: set[tuple[WarehouseAsset, ...]] = {
90
89
  supported_assets[group] for group in groups
91
90
  }
92
91
  return list(extractable)
@@ -1,6 +1,7 @@
1
1
  import logging
2
+ from collections.abc import Iterator
2
3
  from itertools import chain
3
- from typing import Callable, Iterator, List, Optional, Tuple
4
+ from typing import Callable, Optional
4
5
 
5
6
  from ...utils import (
6
7
  OUTPUT_DIR,
@@ -16,7 +17,7 @@ from .query import AbstractQueryBuilder, ExtractionQuery
16
17
  logger = logging.getLogger(__name__)
17
18
 
18
19
 
19
- def common_args(kwargs: dict) -> Tuple[str, bool]:
20
+ def common_args(kwargs: dict) -> tuple[str, bool]:
20
21
  """Args used by all technologies"""
21
22
  output_directory = kwargs.get("output_directory") or from_env(OUTPUT_DIR)
22
23
  skip_existing = kwargs.get("skip_existing") or False
@@ -39,7 +40,7 @@ class SQLExtractionProcessor:
39
40
  self._safe_mode = safe_mode
40
41
 
41
42
  @staticmethod
42
- def _unique(data: Iterator[dict]) -> List[dict]:
43
+ def _unique(data: Iterator[dict]) -> list[dict]:
43
44
  """
44
45
  Remove duplicate in the given data.
45
46
  Remark: this method implies loading all data in memory: it breaks the streaming pipeline !
@@ -1,7 +1,7 @@
1
1
  import inspect
2
2
  import os
3
3
  from abc import ABC, abstractmethod
4
- from typing import List, Optional, Tuple
4
+ from typing import Optional
5
5
 
6
6
  from .asset import WarehouseAsset
7
7
  from .time_filter import TimeFilter
@@ -37,7 +37,7 @@ class AbstractQueryBuilder(ABC):
37
37
  def __init__(
38
38
  self,
39
39
  time_filter: Optional[TimeFilter],
40
- duplicated: Optional[Tuple[WarehouseAsset, ...]] = None,
40
+ duplicated: Optional[tuple[WarehouseAsset, ...]] = None,
41
41
  ):
42
42
  self._time_filter = time_filter or TimeFilter.default()
43
43
  self._duplicated = duplicated
@@ -55,7 +55,7 @@ class AbstractQueryBuilder(ABC):
55
55
  """read from a file located in queries directory"""
56
56
  root = os.path.dirname(inspect.getfile(self.__class__))
57
57
  path = os.path.join(root, QUERIES_DIR, filename)
58
- with open(path, "r") as f:
58
+ with open(path) as f:
59
59
  return f.read()
60
60
 
61
61
  def load_statement(self, asset: WarehouseAsset) -> str:
@@ -75,7 +75,7 @@ class AbstractQueryBuilder(ABC):
75
75
  return ExtractionQuery(statement, params)
76
76
 
77
77
  @abstractmethod
78
- def build(self, asset: WarehouseAsset) -> List[ExtractionQuery]:
78
+ def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
79
79
  """
80
80
  Build the Query allowing extraction of the given asset
81
81
  - Most of the time, returns a single query
@@ -1,6 +1,6 @@
1
1
  import itertools
2
2
  import logging
3
- from typing import List, Optional, Set
3
+ from typing import Optional
4
4
 
5
5
  from google.api_core.exceptions import Forbidden # type: ignore
6
6
  from google.cloud.bigquery import Client as GoogleCloudClient # type: ignore
@@ -27,9 +27,9 @@ class BigQueryClient(SqlalchemyClient):
27
27
  def __init__(
28
28
  self,
29
29
  credentials: dict,
30
- db_allowed: Optional[Set[str]] = None,
31
- db_blocked: Optional[Set[str]] = None,
32
- dataset_blocked: Optional[Set[str]] = None,
30
+ db_allowed: Optional[set[str]] = None,
31
+ db_blocked: Optional[set[str]] = None,
32
+ dataset_blocked: Optional[set[str]] = None,
33
33
  ):
34
34
  super().__init__(credentials)
35
35
  self._db_allowed = db_allowed
@@ -37,8 +37,8 @@ class BigQueryClient(SqlalchemyClient):
37
37
  self._dataset_blocked = dataset_blocked
38
38
  self.credentials = self._credentials()
39
39
  self.client = self._client()
40
- self._projects: List[str] | None = None
41
- self._datasets: List[Dataset] | None = None
40
+ self._projects: list[str] | None = None
41
+ self._datasets: list[Dataset] | None = None
42
42
 
43
43
  @staticmethod
44
44
  def name() -> str:
@@ -78,7 +78,7 @@ class BigQueryClient(SqlalchemyClient):
78
78
  credentials=self.credentials,
79
79
  )
80
80
 
81
- def _list_datasets(self) -> List[Dataset]:
81
+ def _list_datasets(self) -> list[Dataset]:
82
82
  """
83
83
  Returns datasets available for the given GCP client
84
84
  Cache the result in self._datasets to reduce number of API calls
@@ -98,7 +98,7 @@ class BigQueryClient(SqlalchemyClient):
98
98
  base_ms=_RETRY_BASE_MS,
99
99
  log_exc_info=True,
100
100
  )
101
- def get_projects(self) -> List[str]:
101
+ def get_projects(self) -> list[str]:
102
102
  """
103
103
  Returns distinct project_id available for the given GCP client
104
104
  Cache the result in self._projects to reduce number of API calls.
@@ -38,7 +38,7 @@ def _credentials(params: dict) -> dict:
38
38
  """extract GCP credentials"""
39
39
  path = params.get("credentials") or from_env(BIGQUERY_CREDENTIALS)
40
40
  logger.info(f"Credentials fetched from {path}")
41
- with open(path, "r") as file:
41
+ with open(path) as file:
42
42
  return cast(dict, json.load(file))
43
43
 
44
44
 
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import List, Optional
2
+ from typing import Optional
3
3
 
4
4
  from ..abstract import (
5
5
  AbstractQueryBuilder,
@@ -109,7 +109,7 @@ class BigQueryQueryBuilder(AbstractQueryBuilder):
109
109
  else self._regions
110
110
  )
111
111
 
112
- def build(self, asset: WarehouseAsset) -> List[ExtractionQuery]:
112
+ def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
113
113
  """
114
114
  It would be easier to stitch data directly in the query statement (UNION ALL).
115
115
  Unfortunately, querying INFORMATION_SCHEMA on multiple regions
@@ -1,4 +1,2 @@
1
- from typing import Set, Tuple
2
-
3
- SetString = Set[str]
4
- SetTwoString = Set[Tuple[str, str]]
1
+ SetString = set[str]
2
+ SetTwoString = set[tuple[str, str]]
@@ -1,7 +1,8 @@
1
1
  import logging
2
+ from collections.abc import Iterator
2
3
  from functools import partial
3
4
  from http import HTTPStatus
4
- from typing import Iterator, List, Optional, Set, Tuple
5
+ from typing import Optional
5
6
 
6
7
  import requests
7
8
 
@@ -55,8 +56,8 @@ class DatabricksAPIClient(APIClient):
55
56
  def __init__(
56
57
  self,
57
58
  credentials: DatabricksCredentials,
58
- db_allowed: Optional[Set[str]] = None,
59
- db_blocked: Optional[Set[str]] = None,
59
+ db_allowed: Optional[set[str]] = None,
60
+ db_blocked: Optional[set[str]] = None,
60
61
  ):
61
62
  auth = DatabricksAuth(credentials)
62
63
  super().__init__(
@@ -81,18 +82,18 @@ class DatabricksAPIClient(APIClient):
81
82
  return False
82
83
  return True
83
84
 
84
- def databases(self) -> List[dict]:
85
+ def databases(self) -> list[dict]:
85
86
  content = self._get(DatabricksEndpointFactory.databases())
86
87
  _databases = self.formatter.format_database(content.get("catalogs", []))
87
88
  return [d for d in _databases if self._keep_catalog(d["database_name"])]
88
89
 
89
- def _schemas_of_database(self, database: dict) -> List[dict]:
90
+ def _schemas_of_database(self, database: dict) -> list[dict]:
90
91
  payload = {"catalog_name": database["database_name"]}
91
92
  content = self._get(DatabricksEndpointFactory.schemas(), params=payload)
92
93
  schemas = content.get("schemas", [])
93
94
  return self.formatter.format_schema(schemas, database)
94
95
 
95
- def schemas(self, databases: List[dict]) -> List[dict]:
96
+ def schemas(self, databases: list[dict]) -> list[dict]:
96
97
  """
97
98
  Get the databricks schemas (also sometimes called databases)
98
99
  (which correspond to the schemas in Castor)
@@ -143,8 +144,8 @@ class DatabricksAPIClient(APIClient):
143
144
  )
144
145
  def get_single_column_lineage(
145
146
  self,
146
- names: Tuple[str, str],
147
- ) -> List[TimestampedLink]:
147
+ names: tuple[str, str],
148
+ ) -> list[TimestampedLink]:
148
149
  """
149
150
  Helper function used in get_lineage_links.
150
151
  Call data lineage API and return the content of the result
@@ -172,7 +173,7 @@ class DatabricksAPIClient(APIClient):
172
173
  )
173
174
  def get_single_table_lineage(
174
175
  self, table_path: str
175
- ) -> List[TimestampedLink]:
176
+ ) -> list[TimestampedLink]:
176
177
  """
177
178
  Helper function used in get_lineage_links.
178
179
  Call data lineage API and return the content of the result
@@ -210,7 +211,7 @@ class DatabricksAPIClient(APIClient):
210
211
  queries = fetch_all_pages(request, DatabricksPagination)
211
212
  return queries
212
213
 
213
- def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
214
+ def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
214
215
  """get all queries, hour per hour"""
215
216
  time_range_filters = hourly_time_filters(time_filter)
216
217
  raw_queries = []
@@ -220,14 +221,14 @@ class DatabricksAPIClient(APIClient):
220
221
  raw_queries.extend(hourly)
221
222
  return self.formatter.format_query(raw_queries)
222
223
 
223
- def users(self) -> List[dict]:
224
+ def users(self) -> list[dict]:
224
225
  """
225
226
  retrieve user from api
226
227
  """
227
228
  content = self._get(DatabricksEndpointFactory.users())
228
229
  return self.formatter.format_user(content.get("Resources", []))
229
230
 
230
- def _view_ddl_per_schema(self, schema: dict) -> List[dict]:
231
+ def _view_ddl_per_schema(self, schema: dict) -> list[dict]:
231
232
  payload = {
232
233
  "catalog_name": schema["database_id"],
233
234
  "schema_name": schema["schema_name"],
@@ -236,9 +237,9 @@ class DatabricksAPIClient(APIClient):
236
237
  content = self._get(DatabricksEndpointFactory.tables(), params=payload)
237
238
  return self.formatter.format_view_ddl(content.get("tables", []), schema)
238
239
 
239
- def view_ddl(self, schemas: List[dict]) -> List[dict]:
240
+ def view_ddl(self, schemas: list[dict]) -> list[dict]:
240
241
  """retrieve view ddl"""
241
- view_ddl: List[dict] = []
242
+ view_ddl: list[dict] = []
242
243
  for schema in schemas:
243
244
  v_to_add = self._view_ddl_per_schema(schema)
244
245
  view_ddl.extend(v_to_add)
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  from concurrent.futures import ThreadPoolExecutor
3
- from typing import List, Optional, Set
3
+ from typing import Optional
4
4
 
5
5
  from ...utils import (
6
6
  mapping_from_rows,
@@ -25,8 +25,8 @@ class DatabricksClient:
25
25
  def __init__(
26
26
  self,
27
27
  credentials: DatabricksCredentials,
28
- db_allowed: Optional[Set[str]] = None,
29
- db_blocked: Optional[Set[str]] = None,
28
+ db_allowed: Optional[set[str]] = None,
29
+ db_blocked: Optional[set[str]] = None,
30
30
  has_table_tags: bool = False,
31
31
  has_column_tags: bool = False,
32
32
  ):
@@ -58,26 +58,26 @@ class DatabricksClient:
58
58
  return {**table, "owner_external_id": owner_external_id}
59
59
 
60
60
  @staticmethod
61
- def _get_user_mapping(users: List[dict]) -> dict:
61
+ def _get_user_mapping(users: list[dict]) -> dict:
62
62
  return {
63
63
  **mapping_from_rows(users, "email", "id"),
64
64
  **mapping_from_rows(users, "user_name", "id"),
65
65
  }
66
66
 
67
- def schemas(self, databases: List[dict]) -> List[dict]:
67
+ def schemas(self, databases: list[dict]) -> list[dict]:
68
68
  return self.api_client.schemas(databases)
69
69
 
70
- def databases(self) -> List[dict]:
70
+ def databases(self) -> list[dict]:
71
71
  return self.api_client.databases()
72
72
 
73
73
  def tables_and_columns(
74
- self, schemas: List[dict], users: List[dict]
74
+ self, schemas: list[dict], users: list[dict]
75
75
  ) -> TablesColumns:
76
76
  """
77
77
  Get the databricks tables & columns leveraging the unity catalog API
78
78
  """
79
- tables: List[dict] = []
80
- columns: List[dict] = []
79
+ tables: list[dict] = []
80
+ columns: list[dict] = []
81
81
  user_mapping = self._get_user_mapping(users)
82
82
  table_tags = self.sql_client.get_tags_mapping(TagEntity.TABLE)
83
83
  column_tags = self.sql_client.get_tags_mapping(TagEntity.COLUMN)
@@ -95,7 +95,7 @@ class DatabricksClient:
95
95
  columns.extend(c_to_add)
96
96
  return tables, columns
97
97
 
98
- def table_lineage(self, tables: List[dict]) -> List[dict]:
98
+ def table_lineage(self, tables: list[dict]) -> list[dict]:
99
99
  """
100
100
  Wrapper function that retrieves all table lineage
101
101
  """
@@ -113,8 +113,8 @@ class DatabricksClient:
113
113
  return self.formatter.format_lineage(deduplicated)
114
114
 
115
115
  def column_lineage(
116
- self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
117
- ) -> List[dict]:
116
+ self, tables: list[dict], columns: list[dict], table_lineage: list[dict]
117
+ ) -> list[dict]:
118
118
  """
119
119
  Wrapper function that retrieves all column lineage
120
120
  we only try to retrieve column lineage if we found table lineage
@@ -129,17 +129,17 @@ class DatabricksClient:
129
129
  results = executor.map(
130
130
  self.api_client.get_single_column_lineage, candidate_paths
131
131
  )
132
- lineages: List[TimestampedLink] = [
132
+ lineages: list[TimestampedLink] = [
133
133
  link for links in results for link in links
134
134
  ]
135
135
  deduplicated = deduplicate_lineage(lineages)
136
136
  return self.formatter.format_lineage(deduplicated)
137
137
 
138
- def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
138
+ def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
139
139
  return self.api_client.queries(time_filter)
140
140
 
141
- def users(self) -> List[dict]:
141
+ def users(self) -> list[dict]:
142
142
  return self.api_client.users()
143
143
 
144
- def view_ddl(self, schemas: List[dict]) -> List[dict]:
144
+ def view_ddl(self, schemas: list[dict]) -> list[dict]:
145
145
  return self.api_client.view_ddl(schemas)