castor-extractor: castor_extractor-0.21.7-py3-none-any.whl → castor_extractor-0.22.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of castor-extractor might be problematic. Click here for more details.

Files changed (131)
  1. CHANGELOG.md +8 -0
  2. castor_extractor/commands/__init__.py +0 -3
  3. castor_extractor/commands/file_check.py +1 -2
  4. castor_extractor/file_checker/column.py +5 -5
  5. castor_extractor/file_checker/file.py +7 -7
  6. castor_extractor/file_checker/file_test.py +2 -2
  7. castor_extractor/file_checker/templates/generic_warehouse.py +4 -6
  8. castor_extractor/knowledge/confluence/client/client.py +2 -1
  9. castor_extractor/knowledge/confluence/extract.py +3 -2
  10. castor_extractor/knowledge/notion/client/client.py +3 -2
  11. castor_extractor/knowledge/notion/extract.py +3 -2
  12. castor_extractor/quality/soda/client/client.py +2 -1
  13. castor_extractor/quality/soda/client/pagination.py +1 -3
  14. castor_extractor/types.py +3 -3
  15. castor_extractor/uploader/env.py +2 -2
  16. castor_extractor/uploader/upload.py +4 -3
  17. castor_extractor/uploader/utils.py +1 -1
  18. castor_extractor/utils/client/abstract.py +2 -1
  19. castor_extractor/utils/client/api/auth.py +2 -2
  20. castor_extractor/utils/client/api/auth_test.py +2 -2
  21. castor_extractor/utils/client/api/client.py +8 -3
  22. castor_extractor/utils/client/api/pagination.py +3 -2
  23. castor_extractor/utils/client/api/safe_request.py +5 -5
  24. castor_extractor/utils/collection.py +7 -11
  25. castor_extractor/utils/dbt/client.py +3 -3
  26. castor_extractor/utils/dbt/client_test.py +2 -2
  27. castor_extractor/utils/deprecate.py +1 -2
  28. castor_extractor/utils/files.py +5 -5
  29. castor_extractor/utils/formatter.py +5 -4
  30. castor_extractor/utils/json_stream_write.py +2 -1
  31. castor_extractor/utils/object.py +2 -1
  32. castor_extractor/utils/pager/pager.py +2 -4
  33. castor_extractor/utils/pager/pager_on_id.py +2 -1
  34. castor_extractor/utils/pager/pager_on_id_test.py +5 -5
  35. castor_extractor/utils/pager/pager_test.py +3 -3
  36. castor_extractor/utils/retry.py +4 -3
  37. castor_extractor/utils/retry_test.py +2 -3
  38. castor_extractor/utils/safe.py +3 -3
  39. castor_extractor/utils/salesforce/client.py +2 -1
  40. castor_extractor/utils/salesforce/credentials.py +1 -3
  41. castor_extractor/utils/store.py +2 -1
  42. castor_extractor/utils/string.py +2 -2
  43. castor_extractor/utils/string_test.py +1 -3
  44. castor_extractor/utils/type.py +3 -2
  45. castor_extractor/utils/validation.py +4 -4
  46. castor_extractor/utils/write.py +2 -2
  47. castor_extractor/visualization/domo/client/client.py +8 -7
  48. castor_extractor/visualization/domo/client/credentials.py +2 -2
  49. castor_extractor/visualization/domo/client/endpoints.py +2 -2
  50. castor_extractor/visualization/domo/extract.py +3 -2
  51. castor_extractor/visualization/looker/api/client.py +17 -16
  52. castor_extractor/visualization/looker/api/utils.py +2 -2
  53. castor_extractor/visualization/looker/assets.py +1 -3
  54. castor_extractor/visualization/looker/extract.py +4 -3
  55. castor_extractor/visualization/looker/fields.py +3 -3
  56. castor_extractor/visualization/looker/multithreading.py +3 -3
  57. castor_extractor/visualization/metabase/assets.py +1 -3
  58. castor_extractor/visualization/metabase/client/api/client.py +8 -7
  59. castor_extractor/visualization/metabase/extract.py +3 -2
  60. castor_extractor/visualization/metabase/types.py +1 -3
  61. castor_extractor/visualization/mode/client/client.py +6 -6
  62. castor_extractor/visualization/mode/extract.py +2 -2
  63. castor_extractor/visualization/powerbi/assets.py +1 -3
  64. castor_extractor/visualization/powerbi/client/client.py +12 -11
  65. castor_extractor/visualization/powerbi/client/credentials.py +3 -3
  66. castor_extractor/visualization/powerbi/client/endpoints.py +2 -2
  67. castor_extractor/visualization/powerbi/extract.py +3 -2
  68. castor_extractor/visualization/qlik/assets.py +1 -3
  69. castor_extractor/visualization/qlik/client/constants.py +1 -3
  70. castor_extractor/visualization/qlik/client/engine/error.py +1 -3
  71. castor_extractor/visualization/qlik/client/master.py +3 -3
  72. castor_extractor/visualization/qlik/client/rest.py +12 -12
  73. castor_extractor/visualization/qlik/extract.py +4 -3
  74. castor_extractor/visualization/salesforce_reporting/client/rest.py +3 -2
  75. castor_extractor/visualization/salesforce_reporting/client/soql.py +1 -3
  76. castor_extractor/visualization/salesforce_reporting/extract.py +3 -2
  77. castor_extractor/visualization/sigma/client/client.py +11 -8
  78. castor_extractor/visualization/sigma/client/credentials.py +1 -3
  79. castor_extractor/visualization/sigma/client/pagination.py +1 -1
  80. castor_extractor/visualization/sigma/extract.py +3 -2
  81. castor_extractor/visualization/tableau/assets.py +1 -2
  82. castor_extractor/visualization/tableau/client/client.py +1 -2
  83. castor_extractor/visualization/tableau/client/client_utils.py +3 -2
  84. castor_extractor/visualization/tableau/client/credentials.py +3 -3
  85. castor_extractor/visualization/tableau/client/safe_mode.py +1 -2
  86. castor_extractor/visualization/tableau/extract.py +2 -2
  87. castor_extractor/visualization/tableau/gql_fields.py +3 -3
  88. castor_extractor/visualization/tableau/tsc_fields.py +1 -2
  89. castor_extractor/visualization/tableau/types.py +3 -3
  90. castor_extractor/visualization/tableau_revamp/client/client.py +6 -1
  91. castor_extractor/visualization/tableau_revamp/client/client_metadata_api.py +56 -9
  92. castor_extractor/visualization/tableau_revamp/client/client_rest_api.py +3 -3
  93. castor_extractor/visualization/tableau_revamp/client/client_tsc.py +3 -2
  94. castor_extractor/visualization/tableau_revamp/client/errors.py +5 -0
  95. castor_extractor/visualization/tableau_revamp/client/gql_queries.py +1 -3
  96. castor_extractor/visualization/tableau_revamp/client/rest_fields.py +1 -3
  97. castor_extractor/visualization/tableau_revamp/extract.py +2 -2
  98. castor_extractor/visualization/thoughtspot/client/client.py +3 -2
  99. castor_extractor/visualization/thoughtspot/client/utils.py +1 -1
  100. castor_extractor/visualization/thoughtspot/extract.py +3 -2
  101. castor_extractor/warehouse/abstract/asset.py +4 -5
  102. castor_extractor/warehouse/abstract/extract.py +4 -3
  103. castor_extractor/warehouse/abstract/query.py +4 -4
  104. castor_extractor/warehouse/bigquery/client.py +8 -8
  105. castor_extractor/warehouse/bigquery/extract.py +1 -1
  106. castor_extractor/warehouse/bigquery/query.py +2 -2
  107. castor_extractor/warehouse/bigquery/types.py +2 -4
  108. castor_extractor/warehouse/databricks/api_client.py +15 -14
  109. castor_extractor/warehouse/databricks/client.py +16 -16
  110. castor_extractor/warehouse/databricks/extract.py +4 -4
  111. castor_extractor/warehouse/databricks/format.py +12 -12
  112. castor_extractor/warehouse/databricks/lineage.py +11 -11
  113. castor_extractor/warehouse/databricks/pagination.py +2 -2
  114. castor_extractor/warehouse/databricks/types.py +4 -4
  115. castor_extractor/warehouse/databricks/utils.py +5 -4
  116. castor_extractor/warehouse/mysql/query.py +2 -2
  117. castor_extractor/warehouse/postgres/query.py +2 -2
  118. castor_extractor/warehouse/redshift/client.py +1 -1
  119. castor_extractor/warehouse/redshift/query.py +2 -2
  120. castor_extractor/warehouse/salesforce/client.py +8 -8
  121. castor_extractor/warehouse/salesforce/extract.py +3 -4
  122. castor_extractor/warehouse/salesforce/format.py +8 -7
  123. castor_extractor/warehouse/salesforce/format_test.py +2 -4
  124. castor_extractor/warehouse/snowflake/query.py +5 -5
  125. castor_extractor/warehouse/sqlserver/client.py +1 -1
  126. castor_extractor/warehouse/sqlserver/query.py +2 -2
  127. {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/METADATA +11 -6
  128. {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/RECORD +131 -131
  129. {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/LICENCE +0 -0
  130. {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/WHEEL +0 -0
  131. {castor_extractor-0.21.7.dist-info → castor_extractor-0.22.0.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,8 @@
1
1
  import logging
2
+ from collections.abc import Iterator
2
3
  from functools import partial
3
4
  from http import HTTPStatus
4
- from typing import Iterator, List, Optional, Set, Tuple
5
+ from typing import Optional
5
6
 
6
7
  import requests
7
8
 
@@ -55,8 +56,8 @@ class DatabricksAPIClient(APIClient):
55
56
  def __init__(
56
57
  self,
57
58
  credentials: DatabricksCredentials,
58
- db_allowed: Optional[Set[str]] = None,
59
- db_blocked: Optional[Set[str]] = None,
59
+ db_allowed: Optional[set[str]] = None,
60
+ db_blocked: Optional[set[str]] = None,
60
61
  ):
61
62
  auth = DatabricksAuth(credentials)
62
63
  super().__init__(
@@ -81,18 +82,18 @@ class DatabricksAPIClient(APIClient):
81
82
  return False
82
83
  return True
83
84
 
84
- def databases(self) -> List[dict]:
85
+ def databases(self) -> list[dict]:
85
86
  content = self._get(DatabricksEndpointFactory.databases())
86
87
  _databases = self.formatter.format_database(content.get("catalogs", []))
87
88
  return [d for d in _databases if self._keep_catalog(d["database_name"])]
88
89
 
89
- def _schemas_of_database(self, database: dict) -> List[dict]:
90
+ def _schemas_of_database(self, database: dict) -> list[dict]:
90
91
  payload = {"catalog_name": database["database_name"]}
91
92
  content = self._get(DatabricksEndpointFactory.schemas(), params=payload)
92
93
  schemas = content.get("schemas", [])
93
94
  return self.formatter.format_schema(schemas, database)
94
95
 
95
- def schemas(self, databases: List[dict]) -> List[dict]:
96
+ def schemas(self, databases: list[dict]) -> list[dict]:
96
97
  """
97
98
  Get the databricks schemas (also sometimes called databases)
98
99
  (which correspond to the schemas in Castor)
@@ -143,8 +144,8 @@ class DatabricksAPIClient(APIClient):
143
144
  )
144
145
  def get_single_column_lineage(
145
146
  self,
146
- names: Tuple[str, str],
147
- ) -> List[TimestampedLink]:
147
+ names: tuple[str, str],
148
+ ) -> list[TimestampedLink]:
148
149
  """
149
150
  Helper function used in get_lineage_links.
150
151
  Call data lineage API and return the content of the result
@@ -172,7 +173,7 @@ class DatabricksAPIClient(APIClient):
172
173
  )
173
174
  def get_single_table_lineage(
174
175
  self, table_path: str
175
- ) -> List[TimestampedLink]:
176
+ ) -> list[TimestampedLink]:
176
177
  """
177
178
  Helper function used in get_lineage_links.
178
179
  Call data lineage API and return the content of the result
@@ -210,7 +211,7 @@ class DatabricksAPIClient(APIClient):
210
211
  queries = fetch_all_pages(request, DatabricksPagination)
211
212
  return queries
212
213
 
213
- def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
214
+ def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
214
215
  """get all queries, hour per hour"""
215
216
  time_range_filters = hourly_time_filters(time_filter)
216
217
  raw_queries = []
@@ -220,14 +221,14 @@ class DatabricksAPIClient(APIClient):
220
221
  raw_queries.extend(hourly)
221
222
  return self.formatter.format_query(raw_queries)
222
223
 
223
- def users(self) -> List[dict]:
224
+ def users(self) -> list[dict]:
224
225
  """
225
226
  retrieve user from api
226
227
  """
227
228
  content = self._get(DatabricksEndpointFactory.users())
228
229
  return self.formatter.format_user(content.get("Resources", []))
229
230
 
230
- def _view_ddl_per_schema(self, schema: dict) -> List[dict]:
231
+ def _view_ddl_per_schema(self, schema: dict) -> list[dict]:
231
232
  payload = {
232
233
  "catalog_name": schema["database_id"],
233
234
  "schema_name": schema["schema_name"],
@@ -236,9 +237,9 @@ class DatabricksAPIClient(APIClient):
236
237
  content = self._get(DatabricksEndpointFactory.tables(), params=payload)
237
238
  return self.formatter.format_view_ddl(content.get("tables", []), schema)
238
239
 
239
- def view_ddl(self, schemas: List[dict]) -> List[dict]:
240
+ def view_ddl(self, schemas: list[dict]) -> list[dict]:
240
241
  """retrieve view ddl"""
241
- view_ddl: List[dict] = []
242
+ view_ddl: list[dict] = []
242
243
  for schema in schemas:
243
244
  v_to_add = self._view_ddl_per_schema(schema)
244
245
  view_ddl.extend(v_to_add)
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  from concurrent.futures import ThreadPoolExecutor
3
- from typing import List, Optional, Set
3
+ from typing import Optional
4
4
 
5
5
  from ...utils import (
6
6
  mapping_from_rows,
@@ -25,8 +25,8 @@ class DatabricksClient:
25
25
  def __init__(
26
26
  self,
27
27
  credentials: DatabricksCredentials,
28
- db_allowed: Optional[Set[str]] = None,
29
- db_blocked: Optional[Set[str]] = None,
28
+ db_allowed: Optional[set[str]] = None,
29
+ db_blocked: Optional[set[str]] = None,
30
30
  has_table_tags: bool = False,
31
31
  has_column_tags: bool = False,
32
32
  ):
@@ -58,26 +58,26 @@ class DatabricksClient:
58
58
  return {**table, "owner_external_id": owner_external_id}
59
59
 
60
60
  @staticmethod
61
- def _get_user_mapping(users: List[dict]) -> dict:
61
+ def _get_user_mapping(users: list[dict]) -> dict:
62
62
  return {
63
63
  **mapping_from_rows(users, "email", "id"),
64
64
  **mapping_from_rows(users, "user_name", "id"),
65
65
  }
66
66
 
67
- def schemas(self, databases: List[dict]) -> List[dict]:
67
+ def schemas(self, databases: list[dict]) -> list[dict]:
68
68
  return self.api_client.schemas(databases)
69
69
 
70
- def databases(self) -> List[dict]:
70
+ def databases(self) -> list[dict]:
71
71
  return self.api_client.databases()
72
72
 
73
73
  def tables_and_columns(
74
- self, schemas: List[dict], users: List[dict]
74
+ self, schemas: list[dict], users: list[dict]
75
75
  ) -> TablesColumns:
76
76
  """
77
77
  Get the databricks tables & columns leveraging the unity catalog API
78
78
  """
79
- tables: List[dict] = []
80
- columns: List[dict] = []
79
+ tables: list[dict] = []
80
+ columns: list[dict] = []
81
81
  user_mapping = self._get_user_mapping(users)
82
82
  table_tags = self.sql_client.get_tags_mapping(TagEntity.TABLE)
83
83
  column_tags = self.sql_client.get_tags_mapping(TagEntity.COLUMN)
@@ -95,7 +95,7 @@ class DatabricksClient:
95
95
  columns.extend(c_to_add)
96
96
  return tables, columns
97
97
 
98
- def table_lineage(self, tables: List[dict]) -> List[dict]:
98
+ def table_lineage(self, tables: list[dict]) -> list[dict]:
99
99
  """
100
100
  Wrapper function that retrieves all table lineage
101
101
  """
@@ -113,8 +113,8 @@ class DatabricksClient:
113
113
  return self.formatter.format_lineage(deduplicated)
114
114
 
115
115
  def column_lineage(
116
- self, tables: List[dict], columns: List[dict], table_lineage: List[dict]
117
- ) -> List[dict]:
116
+ self, tables: list[dict], columns: list[dict], table_lineage: list[dict]
117
+ ) -> list[dict]:
118
118
  """
119
119
  Wrapper function that retrieves all column lineage
120
120
  we only try to retrieve column lineage if we found table lineage
@@ -129,17 +129,17 @@ class DatabricksClient:
129
129
  results = executor.map(
130
130
  self.api_client.get_single_column_lineage, candidate_paths
131
131
  )
132
- lineages: List[TimestampedLink] = [
132
+ lineages: list[TimestampedLink] = [
133
133
  link for links in results for link in links
134
134
  ]
135
135
  deduplicated = deduplicate_lineage(lineages)
136
136
  return self.formatter.format_lineage(deduplicated)
137
137
 
138
- def queries(self, time_filter: Optional[TimeFilter] = None) -> List[dict]:
138
+ def queries(self, time_filter: Optional[TimeFilter] = None) -> list[dict]:
139
139
  return self.api_client.queries(time_filter)
140
140
 
141
- def users(self) -> List[dict]:
141
+ def users(self) -> list[dict]:
142
142
  return self.api_client.users()
143
143
 
144
- def view_ddl(self, schemas: List[dict]) -> List[dict]:
144
+ def view_ddl(self, schemas: list[dict]) -> list[dict]:
145
145
  return self.api_client.view_ddl(schemas)
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Dict, Optional
2
+ from typing import Optional
3
3
 
4
4
  from ...utils import AbstractStorage, LocalStorage, write_summary
5
5
  from ..abstract import (
@@ -29,7 +29,7 @@ DATABRICKS_ASSETS: SupportedAssets = {
29
29
  logger = logging.getLogger(__name__)
30
30
 
31
31
  OTimeFilter = Optional[TimeFilter]
32
- Paths = Dict[str, str]
32
+ Paths = dict[str, str]
33
33
 
34
34
 
35
35
  class DatabricksExtractionProcessor:
@@ -71,7 +71,7 @@ class DatabricksExtractionProcessor:
71
71
  if self._should_not_reextract(WarehouseAssetGroup.CATALOG):
72
72
  return self._existing_group_paths(WarehouseAssetGroup.CATALOG)
73
73
 
74
- catalog_locations: Dict[str, str] = dict()
74
+ catalog_locations: dict[str, str] = dict()
75
75
  databases = self._client.databases()
76
76
  location = self._storage.put(WarehouseAsset.DATABASE.value, databases)
77
77
  catalog_locations[WarehouseAsset.DATABASE.value] = location
@@ -101,7 +101,7 @@ class DatabricksExtractionProcessor:
101
101
  return self._existing_group_paths(
102
102
  WarehouseAssetGroup.ADDITIONAL_LINEAGE
103
103
  )
104
- lineage_locations: Dict[str, str] = dict()
104
+ lineage_locations: dict[str, str] = dict()
105
105
 
106
106
  # extract catalog
107
107
  databases = self._client.databases()
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  from datetime import datetime
3
- from typing import Dict, List, Optional
3
+ from typing import Optional
4
4
 
5
5
  from .types import TablesColumns
6
6
  from .utils import build_path
@@ -12,7 +12,7 @@ EXCLUDED_SCHEMAS = {"information_schema", "default"}
12
12
 
13
13
  TABLE_URL_TPL = "{host}explore/data/{catalog_name}/{schema_name}/{table_name}?o={workspace_id}"
14
14
 
15
- TagMapping = Dict[str, List[str]]
15
+ TagMapping = dict[str, list[str]]
16
16
 
17
17
 
18
18
  def _to_datetime_or_none(time_ms: Optional[int]) -> Optional[datetime]:
@@ -87,7 +87,7 @@ class DatabricksFormatter:
87
87
  """
88
88
 
89
89
  @staticmethod
90
- def format_database(raw_databases: List[dict]) -> List[dict]:
90
+ def format_database(raw_databases: list[dict]) -> list[dict]:
91
91
  databases = []
92
92
  for catalog in raw_databases:
93
93
  name = catalog["name"]
@@ -101,7 +101,7 @@ class DatabricksFormatter:
101
101
  return databases
102
102
 
103
103
  @staticmethod
104
- def format_schema(raw_schemas: List[dict], database: dict) -> List[dict]:
104
+ def format_schema(raw_schemas: list[dict], database: dict) -> list[dict]:
105
105
  schemas = []
106
106
  for schema in raw_schemas:
107
107
  if schema["name"] in EXCLUDED_SCHEMAS:
@@ -118,7 +118,7 @@ class DatabricksFormatter:
118
118
 
119
119
  @staticmethod
120
120
  def format_table_column(
121
- raw_tables: List[dict],
121
+ raw_tables: list[dict],
122
122
  schema: dict,
123
123
  host: str,
124
124
  workspace_id: str,
@@ -141,8 +141,8 @@ class DatabricksFormatter:
141
141
  return tables, columns
142
142
 
143
143
  @staticmethod
144
- def format_lineage(timestamps: dict) -> List[dict]:
145
- lineage: List[dict] = []
144
+ def format_lineage(timestamps: dict) -> list[dict]:
145
+ lineage: list[dict] = []
146
146
  for link, timestamp in timestamps.items():
147
147
  parent_path, child_path = link
148
148
  link_ = {
@@ -154,7 +154,7 @@ class DatabricksFormatter:
154
154
  return lineage
155
155
 
156
156
  @staticmethod
157
- def format_query(raw_queries: List[dict]) -> List[dict]:
157
+ def format_query(raw_queries: list[dict]) -> list[dict]:
158
158
  queries = []
159
159
  for q in raw_queries:
160
160
  if not q["query_text"]:
@@ -176,7 +176,7 @@ class DatabricksFormatter:
176
176
  return queries
177
177
 
178
178
  @staticmethod
179
- def _primary(emails: List[dict]) -> Optional[str]:
179
+ def _primary(emails: list[dict]) -> Optional[str]:
180
180
  """helper function to select a unique email"""
181
181
  if not emails:
182
182
  return None
@@ -189,7 +189,7 @@ class DatabricksFormatter:
189
189
  emails = user.get("emails")
190
190
  return self._primary(emails) if emails else None
191
191
 
192
- def format_user(self, raw_users: List[dict]) -> List[dict]:
192
+ def format_user(self, raw_users: list[dict]) -> list[dict]:
193
193
  users = []
194
194
  for user in raw_users:
195
195
  users.append(
@@ -204,8 +204,8 @@ class DatabricksFormatter:
204
204
  return users
205
205
 
206
206
  @staticmethod
207
- def format_view_ddl(tables: List[dict], schema: dict) -> List[dict]:
208
- view_ddl: List[dict] = []
207
+ def format_view_ddl(tables: list[dict], schema: dict) -> list[dict]:
208
+ view_ddl: list[dict] = []
209
209
  if not tables:
210
210
  return view_ddl
211
211
  for table in tables:
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, Set, Tuple, cast
1
+ from typing import cast
2
2
 
3
3
  from .types import Link, Ostr, OTimestampedLink, TimestampedLink
4
4
 
@@ -9,7 +9,7 @@ class LineageLinks:
9
9
  """
10
10
 
11
11
  def __init__(self):
12
- self.lineage: Dict[Link, Ostr] = dict()
12
+ self.lineage: dict[Link, Ostr] = dict()
13
13
 
14
14
  def add(self, timestamped_link: TimestampedLink) -> None:
15
15
  """
@@ -52,7 +52,7 @@ def _link(path_from: Ostr, path_to: Ostr, timestamp: Ostr) -> OTimestampedLink:
52
52
 
53
53
  def single_table_lineage_links(
54
54
  table_path: str, single_table_lineage: dict
55
- ) -> List[TimestampedLink]:
55
+ ) -> list[TimestampedLink]:
56
56
  """
57
57
  process databricks lineage API response for a given table
58
58
  returns a list of (parent, child, timestamp)
@@ -60,7 +60,7 @@ def single_table_lineage_links(
60
60
  Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
61
61
  we could also have `notebookInfos` or `fileInfo`
62
62
  """
63
- links: List[OTimestampedLink] = []
63
+ links: list[OTimestampedLink] = []
64
64
  # add parent:
65
65
  for link in single_table_lineage.get("upstreams", []):
66
66
  parent = link.get("tableInfo", {})
@@ -80,7 +80,7 @@ def single_table_lineage_links(
80
80
 
81
81
  def single_column_lineage_links(
82
82
  column_path: str, single_column_lineage: dict
83
- ) -> List[TimestampedLink]:
83
+ ) -> list[TimestampedLink]:
84
84
  """
85
85
  process databricks lineage API response for a given table
86
86
  returns a list of (parent, child, timestamp)
@@ -88,7 +88,7 @@ def single_column_lineage_links(
88
88
  Note: in `upstreams` or `downstreams` we only care about `tableInfo`,
89
89
  we could also have `notebookInfos` or `fileInfo`
90
90
  """
91
- links: List[OTimestampedLink] = []
91
+ links: list[OTimestampedLink] = []
92
92
  # add parent:
93
93
  for link in single_column_lineage.get("upstream_cols", []):
94
94
  parent_path = _to_column_path(link)
@@ -105,8 +105,8 @@ def single_column_lineage_links(
105
105
 
106
106
 
107
107
  def paths_for_column_lineage(
108
- tables: List[dict], columns: List[dict], table_lineage: List[dict]
109
- ) -> List[Tuple[str, str]]:
108
+ tables: list[dict], columns: list[dict], table_lineage: list[dict]
109
+ ) -> list[tuple[str, str]]:
110
110
  """
111
111
  helper providing a list of candidate columns to look lineage for:
112
112
  we only look for column lineage where there is table lineage
@@ -118,12 +118,12 @@ def paths_for_column_lineage(
118
118
  for table in tables
119
119
  }
120
120
 
121
- tables_with_lineage: Set[str] = set()
121
+ tables_with_lineage: set[str] = set()
122
122
  for t in table_lineage:
123
123
  tables_with_lineage.add(t["parent_path"])
124
124
  tables_with_lineage.add(t["child_path"])
125
125
 
126
- paths_to_return: List[Tuple[str, str]] = []
126
+ paths_to_return: list[tuple[str, str]] = []
127
127
  for column in columns:
128
128
  table_path = mapping[column["table_id"]]
129
129
  if table_path not in tables_with_lineage:
@@ -134,7 +134,7 @@ def paths_for_column_lineage(
134
134
  return paths_to_return
135
135
 
136
136
 
137
- def deduplicate_lineage(lineages: List[TimestampedLink]) -> dict:
137
+ def deduplicate_lineage(lineages: list[TimestampedLink]) -> dict:
138
138
  deduplicated_lineage = LineageLinks()
139
139
  for timestamped_link in lineages:
140
140
  deduplicated_lineage.add(timestamped_link)
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import Optional
2
2
 
3
3
  from pydantic import Field
4
4
 
@@ -10,7 +10,7 @@ DATABRICKS_PAGE_SIZE = 100
10
10
  class DatabricksPagination(PaginationModel):
11
11
  next_page_token: Optional[str] = None
12
12
  has_next_page: bool = False
13
- res: List[dict] = Field(default_factory=list)
13
+ res: list[dict] = Field(default_factory=list)
14
14
 
15
15
  def is_last(self) -> bool:
16
16
  return not (self.has_next_page and self.next_page_token)
@@ -1,8 +1,8 @@
1
- from typing import List, Optional, Tuple
1
+ from typing import Optional
2
2
 
3
- Link = Tuple[str, str]
4
- TablesColumns = Tuple[List[dict], List[dict]]
3
+ Link = tuple[str, str]
4
+ TablesColumns = tuple[list[dict], list[dict]]
5
5
  Ostr = Optional[str]
6
- TimestampedLink = Tuple[str, str, Ostr]
6
+ TimestampedLink = tuple[str, str, Ostr]
7
7
 
8
8
  OTimestampedLink = Optional[TimestampedLink]
@@ -1,5 +1,6 @@
1
+ from collections.abc import Iterable
1
2
  from datetime import date
2
- from typing import Dict, Iterable, List, Optional
3
+ from typing import Optional
3
4
 
4
5
  from ...utils import at_midnight
5
6
  from ..abstract import TimeFilter
@@ -14,8 +15,8 @@ def _day_hour_to_epoch_ms(day: date, hour: int) -> int:
14
15
 
15
16
 
16
17
  def build_path(
17
- row: Dict,
18
- keys: List[str],
18
+ row: dict,
19
+ keys: list[str],
19
20
  ) -> str:
20
21
  """
21
22
  format an asset's path:
@@ -26,7 +27,7 @@ def build_path(
26
27
  return ".".join(key_values)
27
28
 
28
29
 
29
- def tag_label(row: Dict) -> str:
30
+ def tag_label(row: dict) -> str:
30
31
  """
31
32
  format the tag's label:
32
33
  - {key:value} when the value is not empty
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import Optional
2
2
 
3
3
  from ..abstract import (
4
4
  AbstractQueryBuilder,
@@ -19,6 +19,6 @@ class MySQLQueryBuilder(AbstractQueryBuilder):
19
19
  ):
20
20
  super().__init__(time_filter=time_filter)
21
21
 
22
- def build(self, asset: WarehouseAsset) -> List[ExtractionQuery]:
22
+ def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
23
23
  query = self.build_default(asset)
24
24
  return [query]
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import Optional
2
2
 
3
3
  from ..abstract import (
4
4
  AbstractQueryBuilder,
@@ -19,6 +19,6 @@ class PostgresQueryBuilder(AbstractQueryBuilder):
19
19
  ):
20
20
  super().__init__(time_filter=time_filter)
21
21
 
22
- def build(self, asset: WarehouseAsset) -> List[ExtractionQuery]:
22
+ def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
23
23
  query = self.build_default(asset)
24
24
  return [query]
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Iterator
2
+ from collections.abc import Iterator
3
3
 
4
4
  from psycopg2 import extensions # type: ignore
5
5
  from sqlalchemy.engine import Connection, ResultProxy
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import Optional
2
2
 
3
3
  from ..abstract import (
4
4
  AbstractQueryBuilder,
@@ -27,7 +27,7 @@ class RedshiftQueryBuilder(AbstractQueryBuilder):
27
27
  params = self._time_filter.to_dict()
28
28
  return ExtractionQuery(statement, params)
29
29
 
30
- def build(self, asset: WarehouseAsset) -> List[ExtractionQuery]:
30
+ def build(self, asset: WarehouseAsset) -> list[ExtractionQuery]:
31
31
  if asset == WarehouseAsset.QUERY and self.is_serverless:
32
32
  query = self.build_query_serverless()
33
33
  else:
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  from functools import partial
3
- from typing import Dict, List, Optional, Tuple
3
+ from typing import Optional
4
4
 
5
5
  from tqdm import tqdm # type: ignore
6
6
 
@@ -29,7 +29,7 @@ class SalesforceClient(SalesforceBaseClient):
29
29
  def name() -> str:
30
30
  return "Salesforce"
31
31
 
32
- def fetch_sobjects(self) -> List[dict]:
32
+ def fetch_sobjects(self) -> list[dict]:
33
33
  """Fetch all sobjects"""
34
34
  logger.info("Extracting sobjects")
35
35
  query = format_sobject_query()
@@ -39,7 +39,7 @@ class SalesforceClient(SalesforceBaseClient):
39
39
  results = fetch_all_pages(request_, SalesforceSQLPagination)
40
40
  return list(results)
41
41
 
42
- def fetch_fields(self, sobject_name: str) -> List[dict]:
42
+ def fetch_fields(self, sobject_name: str) -> list[dict]:
43
43
  """Fetches fields of a given sobject"""
44
44
  query = SOBJECT_FIELDS_QUERY_TPL.format(
45
45
  entity_definition_id=sobject_name
@@ -55,7 +55,7 @@ class SalesforceClient(SalesforceBaseClient):
55
55
  return None
56
56
  return response["records"][0]["Description"]
57
57
 
58
- def add_table_descriptions(self, sobjects: List[dict]) -> List[dict]:
58
+ def add_table_descriptions(self, sobjects: list[dict]) -> list[dict]:
59
59
  """
60
60
  Add table descriptions.
61
61
  We use the tooling API which does not handle well the LIMIT in SOQL
@@ -67,7 +67,7 @@ class SalesforceClient(SalesforceBaseClient):
67
67
  described_sobjects.append({**sobject, "Description": description})
68
68
  return described_sobjects
69
69
 
70
- def tables(self) -> List[dict]:
70
+ def tables(self) -> list[dict]:
71
71
  """
72
72
  Get Salesforce sobjects as tables
73
73
  """
@@ -77,13 +77,13 @@ class SalesforceClient(SalesforceBaseClient):
77
77
  return list(self.formatter.tables(described_sobjects))
78
78
 
79
79
  def columns(
80
- self, sobject_names: List[Tuple[str, str]], show_progress: bool = True
81
- ) -> List[dict]:
80
+ self, sobject_names: list[tuple[str, str]], show_progress: bool = True
81
+ ) -> list[dict]:
82
82
  """
83
83
  Get salesforce sobject fields as columns
84
84
  show_progress: optionally deactivate the tqdm progress bar
85
85
  """
86
- sobject_fields: Dict[str, List[dict]] = dict()
86
+ sobject_fields: dict[str, list[dict]] = dict()
87
87
  for api_name, table_name in tqdm(
88
88
  sobject_names, disable=not show_progress
89
89
  ):
@@ -1,5 +1,4 @@
1
1
  import logging
2
- from typing import Dict, List, Tuple
3
2
 
4
3
  from ...utils import AbstractStorage, LocalStorage, write_summary
5
4
  from ...utils.salesforce import SalesforceCredentials
@@ -14,9 +13,9 @@ from .client import SalesforceClient
14
13
  logger = logging.getLogger(__name__)
15
14
 
16
15
 
17
- Paths = Dict[str, str]
16
+ Paths = dict[str, str]
18
17
 
19
- SALESFORCE_CATALOG_ASSETS: Tuple[WarehouseAsset, ...] = (
18
+ SALESFORCE_CATALOG_ASSETS: tuple[WarehouseAsset, ...] = (
20
19
  WarehouseAsset.TABLE,
21
20
  WarehouseAsset.COLUMN,
22
21
  )
@@ -81,7 +80,7 @@ class SalesforceExtractionProcessor:
81
80
 
82
81
  def extract_role(self) -> Paths:
83
82
  """extract no users and return the empty file location"""
84
- users: List[dict] = []
83
+ users: list[dict] = []
85
84
  location = self._storage.put(WarehouseAsset.USER.value, users)
86
85
  logger.info(f"Extracted {len(users)} users to {location}")
87
86
  return {WarehouseAsset.USER.value: location}
@@ -1,4 +1,5 @@
1
- from typing import Any, Dict, Iterator, List
1
+ from collections.abc import Iterator
2
+ from typing import Any
2
3
 
3
4
  from ...utils import group_by
4
5
  from .constants import SCHEMA_NAME
@@ -25,10 +26,10 @@ def _name(sobject: dict) -> str:
25
26
  return f"{label} ({api_name})"
26
27
 
27
28
 
28
- def _field_description(field: Dict[str, Any]) -> str:
29
- context: Dict[str, str] = {}
29
+ def _field_description(field: dict[str, Any]) -> str:
30
+ context: dict[str, str] = {}
30
31
 
31
- field_definition: Dict[str, str] = field.get("FieldDefinition") or {}
32
+ field_definition: dict[str, str] = field.get("FieldDefinition") or {}
32
33
  if description := field_definition.get("Description"):
33
34
  context["Description"] = _clean(description)
34
35
  if help_text := field.get("InlineHelpText"):
@@ -69,7 +70,7 @@ def _to_table_payload(sobject: dict) -> dict:
69
70
  }
70
71
 
71
72
 
72
- def _detect_duplicates(sobjects: List[dict]) -> List[dict]:
73
+ def _detect_duplicates(sobjects: list[dict]) -> list[dict]:
73
74
  """
74
75
  enrich the given data with "has_duplicate" flag:
75
76
  - True when another asset has the same Label in the list
@@ -89,7 +90,7 @@ class SalesforceFormatter:
89
90
  """
90
91
 
91
92
  @staticmethod
92
- def tables(sobjects: List[dict]) -> Iterator[dict]:
93
+ def tables(sobjects: list[dict]) -> Iterator[dict]:
93
94
  """
94
95
  formats the raw list of sobjects to tables
95
96
  """
@@ -98,7 +99,7 @@ class SalesforceFormatter:
98
99
  yield _to_table_payload(sobject)
99
100
 
100
101
  @staticmethod
101
- def columns(sobject_fields: Dict[str, List[dict]]) -> Iterator[dict]:
102
+ def columns(sobject_fields: dict[str, list[dict]]) -> Iterator[dict]:
102
103
  """formats the raw list of sobject fields to columns"""
103
104
  for table_name, fields in sobject_fields.items():
104
105
  fields = _detect_duplicates(fields)
@@ -1,5 +1,3 @@
1
- from typing import Dict, List, Tuple
2
-
3
1
  from .format import (
4
2
  _HAS_DUPLICATE_KEY,
5
3
  SalesforceFormatter,
@@ -9,7 +7,7 @@ from .format import (
9
7
  )
10
8
 
11
9
 
12
- def _tables_sobjects() -> Tuple[Dict[str, str], ...]:
10
+ def _tables_sobjects() -> tuple[dict[str, str], ...]:
13
11
  """Returns 4 sobjects with 2 sharing the same label"""
14
12
  a = {"Label": "a", "QualifiedApiName": "a_one"}
15
13
  b = {"Label": "b", "QualifiedApiName": "b"}
@@ -18,7 +16,7 @@ def _tables_sobjects() -> Tuple[Dict[str, str], ...]:
18
16
  return a, b, c, a_prime
19
17
 
20
18
 
21
- def _columns_sobjects() -> Dict[str, List[dict]]:
19
+ def _columns_sobjects() -> dict[str, list[dict]]:
22
20
  a = {"Label": "First Name", "QualifiedApiName": "owner_name"}
23
21
  b = {"Label": "First Name", "QualifiedApiName": "editor_name"}
24
22
  c = {"Label": "Foo Bar", "QualifiedApiName": "foo_bar"}